Skip to content

Commit f227162

Browse files
committed
benchmarks: import jetscii's benchmarks
There was some discussion about how to compare jetscii with Teddy and some interesting benchmark results[1]. I decided to import the benchmarks and see what things look like here. [1]: shepmaster/jetscii#57
1 parent 964a2d5 commit f227162

File tree

13 files changed

+161270
-4
lines changed

13 files changed

+161270
-4
lines changed

.vim/coc-settings.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
"aho-corasick-debug/Cargo.toml",
44
"benchmarks/engines/rust-aho-corasick/Cargo.toml",
55
"benchmarks/engines/rust-daachorse/Cargo.toml",
6+
"benchmarks/engines/rust-jetscii/Cargo.toml",
67
"benchmarks/engines/naive/Cargo.toml",
78
"benchmarks/shared/Cargo.toml",
89
"fuzz/Cargo.toml",

benchmarks/definitions/jetscii.toml

Lines changed: 174 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,174 @@
1+
analysis = '''
2+
These benchmarks were ported out of the jetscii crate, specifically from
3+
[Dr-Emann's PR]. There were some irregularities in the benchmark results, so
4+
I thought it might be interesting to include them here.
5+
6+
We add "real" variants of each benchmark as well using a small XML data set on
7+
mental health. The original benchmarks search a haystack consisting entirely of
8+
`a` repeated, with the last byte corresponding to one of the needle bytes. This
9+
is useful for measuring pure throughput, but less good for approximating real
10+
world performance. In this case, for at least `xmldelim3` and `xmldelim5`, it
11+
seems like an XML haystack would be better suited.
12+
13+
[Dr-Emann's PR]: https://github.com/shepmaster/jetscii/pull/57
14+
'''
15+
16+
[[bench]]
17+
model = "count"
18+
name = "space-repeateda"
19+
regex = [' ']
20+
haystack = { contents = "a", repeat = 5_242_880, append = " " }
21+
count = 1
22+
engines = [
23+
"daachorse/bytewise/leftmost-first",
24+
"rust/aho-corasick/dfa/leftmost-first",
25+
"rust/aho-corasick/packed/leftmost-first",
26+
"rust/old-aho-corasick/packed/leftmost-first",
27+
"rust/jetscii/ascii-chars/prebuilt",
28+
]
29+
30+
[[bench]]
31+
model = "count"
32+
name = "xmldelim3-repeateda"
33+
regex = ['<', '>', '&']
34+
haystack = { contents = "a", repeat = 5_242_880, append = "&" }
35+
count = 1
36+
engines = [
37+
"daachorse/bytewise/leftmost-first",
38+
"rust/aho-corasick/dfa/leftmost-first",
39+
"rust/aho-corasick/packed/leftmost-first",
40+
"rust/old-aho-corasick/packed/leftmost-first",
41+
"rust/jetscii/ascii-chars/prebuilt",
42+
]
43+
44+
[[bench]]
45+
model = "count"
46+
name = "xmldelim5-repeateda"
47+
regex = ['<', '>', '&', "'", '"']
48+
haystack = { contents = "a", repeat = 5_242_880, append = '"' }
49+
count = 1
50+
engines = [
51+
"daachorse/bytewise/leftmost-first",
52+
"rust/aho-corasick/dfa/leftmost-first",
53+
"rust/aho-corasick/packed/leftmost-first",
54+
"rust/old-aho-corasick/packed/leftmost-first",
55+
"rust/jetscii/ascii-chars/prebuilt",
56+
]
57+
58+
[[bench]]
59+
model = "count"
60+
name = "big16-repeateda"
61+
regex = [
62+
'A', 'B', 'C', 'D',
63+
'E', 'F', 'G', 'H',
64+
'I', 'J', 'K', 'L',
65+
'M', 'N', 'O', 'P',
66+
]
67+
haystack = { contents = "a", repeat = 5_242_880, append = "P" }
68+
count = 1
69+
engines = [
70+
"daachorse/bytewise/leftmost-first",
71+
"rust/aho-corasick/dfa/leftmost-first",
72+
"rust/aho-corasick/packed/leftmost-first",
73+
"rust/old-aho-corasick/packed/leftmost-first",
74+
"rust/jetscii/ascii-chars/prebuilt",
75+
]
76+
77+
[[bench]]
78+
model = "count"
79+
name = "big16earlyshort-repeateda"
80+
regex = [
81+
'A', 'B', 'C', 'D',
82+
'E', 'F', 'G', 'H',
83+
'I', 'J', 'K', 'L',
84+
'M', 'N', 'O', 'P',
85+
]
86+
haystack = { contents = "Pa" }
87+
count = 1
88+
engines = [
89+
"daachorse/bytewise/leftmost-first",
90+
"rust/aho-corasick/dfa/leftmost-first",
91+
"rust/aho-corasick/packed/leftmost-first",
92+
"rust/old-aho-corasick/packed/leftmost-first",
93+
"rust/jetscii/ascii-chars/prebuilt",
94+
]
95+
96+
[[bench]]
97+
model = "count"
98+
name = "big16earlylong-repeateda"
99+
regex = [
100+
'A', 'B', 'C', 'D',
101+
'E', 'F', 'G', 'H',
102+
'I', 'J', 'K', 'L',
103+
'M', 'N', 'O', 'P',
104+
]
105+
haystack = { contents = "a", repeat = 14, append = "P" }
106+
count = 1
107+
engines = [
108+
"daachorse/bytewise/leftmost-first",
109+
"rust/aho-corasick/dfa/leftmost-first",
110+
"rust/aho-corasick/packed/leftmost-first",
111+
"rust/old-aho-corasick/packed/leftmost-first",
112+
"rust/jetscii/ascii-chars/prebuilt",
113+
]
114+
115+
[[bench]]
116+
model = "count"
117+
name = "space-mentalhealth"
118+
regex = [' ']
119+
haystack = { path = "catalog.data.gov/mental-health-4weeks.xml" }
120+
count = 1_181_201
121+
engines = [
122+
"daachorse/bytewise/leftmost-first",
123+
"rust/aho-corasick/dfa/leftmost-first",
124+
"rust/aho-corasick/packed/leftmost-first",
125+
"rust/old-aho-corasick/packed/leftmost-first",
126+
"rust/jetscii/ascii-chars/prebuilt",
127+
]
128+
129+
[[bench]]
130+
model = "count"
131+
name = "xmldelim3-mentalhealth"
132+
regex = ['<', '>', '&']
133+
haystack = { path = "catalog.data.gov/mental-health-4weeks.xml" }
134+
count = 604_714
135+
engines = [
136+
"daachorse/bytewise/leftmost-first",
137+
"rust/aho-corasick/dfa/leftmost-first",
138+
"rust/aho-corasick/packed/leftmost-first",
139+
"rust/old-aho-corasick/packed/leftmost-first",
140+
"rust/jetscii/ascii-chars/prebuilt",
141+
]
142+
143+
[[bench]]
144+
model = "count"
145+
name = "xmldelim5-mentalhealth"
146+
regex = ['<', '>', '&', "'", '"']
147+
haystack = { path = "catalog.data.gov/mental-health-4weeks.xml" }
148+
count = 688_252
149+
engines = [
150+
"daachorse/bytewise/leftmost-first",
151+
"rust/aho-corasick/dfa/leftmost-first",
152+
"rust/aho-corasick/packed/leftmost-first",
153+
"rust/old-aho-corasick/packed/leftmost-first",
154+
"rust/jetscii/ascii-chars/prebuilt",
155+
]
156+
157+
[[bench]]
158+
model = "count"
159+
name = "big16-mentalhealth"
160+
regex = [
161+
'A', 'B', 'C', 'D',
162+
'E', 'F', 'G', 'H',
163+
'I', 'J', 'K', 'L',
164+
'M', 'N', 'O', 'P',
165+
]
166+
haystack = { path = "catalog.data.gov/mental-health-4weeks.xml" }
167+
count = 176_447
168+
engines = [
169+
"daachorse/bytewise/leftmost-first",
170+
"rust/aho-corasick/dfa/leftmost-first",
171+
"rust/aho-corasick/packed/leftmost-first",
172+
"rust/old-aho-corasick/packed/leftmost-first",
173+
"rust/jetscii/ascii-chars/prebuilt",
174+
]

benchmarks/engines.toml

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -317,6 +317,42 @@
317317
bin = "cargo"
318318
args = ["clean"]
319319

320+
# Engines based on the `jetscii` Rust crate. This is somewhat more appropriately
321+
# compared with routines in `memchr`, but there is some overlap in use cases
322+
# with Teddy's packed searcher for multiple single-byte needles.
323+
324+
[[engine]]
325+
name = "rust/jetscii/ascii-chars/prebuilt"
326+
cwd = "./engines/rust-jetscii"
327+
[engine.version]
328+
bin = "./target/release/main"
329+
args = ["--version"]
330+
[engine.run]
331+
bin = "./target/release/main"
332+
args = ["ascii-chars-prebuilt"]
333+
[[engine.build]]
334+
bin = "cargo"
335+
args = ["build", "--release"]
336+
[[engine.clean]]
337+
bin = "cargo"
338+
args = ["clean"]
339+
340+
[[engine]]
341+
name = "rust/jetscii/ascii-chars/oneshot"
342+
cwd = "./engines/rust-jetscii"
343+
[engine.version]
344+
bin = "./target/release/main"
345+
args = ["--version"]
346+
[engine.run]
347+
bin = "./target/release/main"
348+
args = ["ascii-chars-oneshot"]
349+
[[engine.build]]
350+
bin = "cargo"
351+
args = ["build", "--release"]
352+
[[engine.clean]]
353+
bin = "cargo"
354+
args = ["clean"]
355+
320356
# Naive engines. Useful for comparisons and to determine the crossover point
321357
# where a multi-substring algorithm is beneficial over multi single-substring
322358
# algorithms. We include both the `memchr` crate and `std`.

benchmarks/engines/naive/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,4 +18,4 @@ path = "../../shared"
1818
[profile.release]
1919
debug = true
2020
codegen-units = 1
21-
lto = "thin"
21+
lto = "fat"

benchmarks/engines/rust-aho-corasick/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,4 +21,4 @@ path = "../../shared"
2121
[profile.release]
2222
debug = true
2323
codegen-units = 1
24-
lto = "thin"
24+
lto = "fat"

benchmarks/engines/rust-daachorse/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,4 +18,4 @@ path = "../../shared"
1818
[profile.release]
1919
debug = true
2020
codegen-units = 1
21-
lto = "thin"
21+
lto = "fat"

benchmarks/engines/rust-jetscii/Cargo.lock

Lines changed: 54 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.
benchmarks/engines/rust-jetscii/Cargo.toml

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
[package]
2+
publish = false
3+
name = "main"
4+
version = "0.5.3"
5+
edition = "2021"
6+
7+
[workspace]
8+
9+
[dependencies]
10+
anyhow = "1.0.72"
11+
jetscii = "=0.5.3"
12+
13+
[dependencies.shared]
14+
path = "../../shared"
15+
16+
[[bin]]
17+
name = "main"
18+
path = "main.rs"
19+
20+
[profile.release]
21+
debug = true
22+
codegen-units = 1
23+
lto = "fat"

0 commit comments

Comments
 (0)