Skip to content

Commit d89b277

Browse files
committed
Add the corpus benchmark
Measure the corpus parse rate and end-to-end (lex + parse) throughput over the MySQL server test corpus, with warmup + timed passes (best/median, JIT detection); --inline-units benchmarks the collapsed-AST mode. On the corpus the parser accepts 99.88% of the ~69.5k queries.
1 parent 6073a90 commit d89b277

3 files changed

Lines changed: 173 additions & 0 deletions

File tree

packages/mysql-parser/README.md

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,18 @@ runs Bison in Docker, and rewrites `src/grammar/parse-table.php` and
107107
byte for byte. Both artifacts are plain PHP arrays. The fetched sources and the (large) automaton dump land in
108108
`build/`, which is gitignored.
109109

110+
## Benchmark
111+
112+
```bash
113+
composer run benchmark # corpus throughput, without and with the tracing JIT
114+
```
115+
116+
The benchmark runs a ~69.5k-query corpus of MySQL server test queries from
117+
the monorepo's shared test data. The parser accepts **99.88%** of it; the 0.12%
118+
it rejects is syntax removed in MySQL 8.4 (e.g. `RESET MASTER`),
119+
multi-statement input, statements needing non-default session SQL modes, and a
120+
few lexer edge cases.
121+
110122
## Pinned MySQL version
111123

112124
The grammar is pinned to **`mysql-8.4.3`** (the version the committed artifacts

packages/mysql-parser/composer.json

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,10 @@
99
"scripts": {
1010
"build-grammar": [
1111
"./bin/build-grammar"
12+
],
13+
"benchmark": [
14+
"@php tests/benchmark.php",
15+
"@php -d opcache.enable_cli=1 -d opcache.jit_buffer_size=64M -d opcache.jit=tracing tests/benchmark.php"
1216
]
1317
}
1418
}
Lines changed: 157 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,157 @@
1+
<?php
2+
/**
3+
* Benchmark the MySQL parser over the MySQL server test corpus.
4+
*
5+
* Reports the corpus parse rate and end-to-end (lex + parse) throughput.
6+
*
7+
* Methodology: a few warmup passes (discarded — they heat opcache, the tracing
8+
* JIT, and the CPU caches) followed by N timed passes over the whole corpus. The
9+
* headline is the BEST pass: parsing is deterministic and CPU-bound, so outside
10+
* interference only ever makes a pass slower, which makes the fastest pass the
11+
* most reproducible estimate. A single cold pass badly under-reports the tracing
12+
* JIT (it pays compilation inside the timed run), so warmup is on by default.
13+
*
14+
* Options:
15+
* --json Machine-readable output.
16+
* --limit=N Only benchmark the first N queries.
17+
* --iterations=N Number of timed passes (default 5).
18+
* --warmup=N Number of discarded warmup passes (default 2).
19+
* --corpus=PATH Path to the queries CSV (default: the mysql-on-sqlite corpus).
20+
* --inline-units Parse with unit-production inlining (the collapsed AST).
21+
*/
22+
23+
// Turn every error PHP reports through the error handler (notices, warnings,
// deprecations, ...) into a thrown ErrorException carrying the original
// message, severity, file and line.
set_error_handler(
	static function ( $errno, $errstr, $errfile, $errline ) {
		throw new ErrorException( $errstr, 0, $errno, $errfile, $errline );
	}
);
28+
29+
// Boolean switches: present anywhere on the command line means "on".
$json         = in_array( '--json', $argv, true );
$inline_units = in_array( '--inline-units', $argv, true );

// Defaults for the valued options; each may be overridden with --name=value.
$limit      = null;
$iterations = 5;
$warmup     = 2;
$corpus     = __DIR__ . '/../../mysql-on-sqlite/tests/mysql/data/mysql-server-tests-queries.csv';

// Recognised "--name=" prefixes. An argument is matched against each prefix in
// turn; the first prefix it starts with decides which setting is updated.
$valued_options = array( '--limit=', '--iterations=', '--warmup=', '--corpus=' );

foreach ( $argv as $arg ) {
	foreach ( $valued_options as $prefix ) {
		$prefix_length = strlen( $prefix );
		if ( 0 !== strncmp( $arg, $prefix, $prefix_length ) ) {
			continue;
		}

		$value = substr( $arg, $prefix_length );
		switch ( $prefix ) {
			case '--limit=':
				// At least one query is always benchmarked.
				$limit = max( 1, (int) $value );
				break;
			case '--iterations=':
				// At least one timed pass is always run.
				$iterations = max( 1, (int) $value );
				break;
			case '--warmup=':
				// Zero warmup passes is allowed; negative values are clamped.
				$warmup = max( 0, (int) $value );
				break;
			case '--corpus=':
				$corpus = $value;
				break;
		}

		// The prefixes are mutually exclusive, so stop after the first match.
		break;
	}
}
46+
47+
// Load the package entry point (presumably defines WP_MySQL_Lexer and
// WP_MySQL_Parser - confirm in src/load.php), then build the single parser
// instance that is reused for every query in every pass.
require_once __DIR__ . '/../src/load.php';

// parse-table.php returns its data via `return`, so `require` yields the table.
$parse_table = require __DIR__ . '/../src/grammar/parse-table.php';
$parser      = new WP_MySQL_Parser( $parse_table, $inline_units );
49+
50+
// Load the corpus before timing so file IO is excluded.
//
// The path must be a regular, readable file: is_readable() alone also accepts
// directories, which would pass this guard and only fail later while reading.
if ( ! is_file( $corpus ) || ! is_readable( $corpus ) ) {
	fwrite( STDERR, "error: corpus not found at $corpus (pass --corpus=PATH).\n" );
	exit( 1 );
}

// Each CSV record contributes its first column as one query. Records whose
// first column is missing or empty are skipped and do not count towards --limit.
$handle      = fopen( $corpus, 'r' );
$queries     = array();
$query_count = 0;
while ( false !== ( $record = fgetcsv( $handle, null, ',', '"', '\\' ) ) ) {
	$query = $record[0] ?? null;
	if ( null === $query || '' === $query ) {
		continue;
	}
	$queries[] = $query;
	++$query_count;
	if ( null !== $limit && $query_count >= $limit ) {
		break;
	}
}
fclose( $handle );

// An empty corpus would otherwise be reported as "0 queries, 0 QPS, 0 failures",
// which looks like a successful run. Fail loudly instead.
if ( 0 === $query_count ) {
	fwrite( STDERR, "error: no queries found in $corpus (expected a CSV with the query in the first column).\n" );
	exit( 1 );
}
69+
70+
// Tallies for the most recent pass. The closure below writes to them by
// reference and resets them at the start of every pass, so after the final
// timed pass they describe that pass only.
$failures   = 0;
$exceptions = 0;

// Runs one end-to-end pass (lex + parse) over every query in the corpus.
// A query counts as a failure when parse() returns null, and as an exception
// when lexing or parsing throws anything.
$parse_corpus = function () use ( $queries, $parser, &$failures, &$exceptions ) {
	$failures   = 0;
	$exceptions = 0;

	foreach ( $queries as $sql ) {
		try {
			$ast = $parser->parse( ( new WP_MySQL_Lexer( $sql ) )->remaining_tokens() );
		} catch ( Throwable $error ) {
			// Count the throw and move on; one bad query must not end the pass.
			++$exceptions;
			continue;
		}

		if ( null === $ast ) {
			++$failures;
		}
	}
};
88+
89+
// Warmup passes: run the whole corpus but discard the timings.
for ( $pass = 0; $pass < $warmup; $pass++ ) {
	$parse_corpus();
}

// Prefer hrtime() (PHP 7.3+): it is monotonic and cannot be adjusted, whereas
// microtime() follows the wall clock and can jump (e.g. NTP corrections),
// which could produce a zero or negative elapsed time. Fall back to
// microtime() only where hrtime() is unavailable.
$use_hrtime = function_exists( 'hrtime' );

// Timed passes: one throughput sample (queries per second) per pass.
$samples = array();
for ( $pass = 0; $pass < $iterations; $pass++ ) {
	$start = $use_hrtime ? hrtime( true ) : microtime( true );
	$parse_corpus();
	$stop = $use_hrtime ? hrtime( true ) : microtime( true );

	// hrtime( true ) counts nanoseconds; microtime( true ) counts seconds.
	$elapsed = $use_hrtime ? ( $stop - $start ) / 1e9 : $stop - $start;

	// A non-positive elapsed time cannot yield a meaningful rate; record 0.0
	// instead of dividing by zero or reporting a negative throughput.
	$samples[] = $elapsed > 0 ? $query_count / $elapsed : 0.0;
}

// Ascending order: slowest sample first, fastest last.
sort( $samples );
$sample_count = count( $samples );

$best   = $samples[ $sample_count - 1 ];
$worst  = $samples[0];
$mean   = array_sum( $samples ) / $sample_count;
$mid    = intdiv( $sample_count, 2 );
$median = 0 === $sample_count % 2
	? ( $samples[ $mid - 1 ] + $samples[ $mid ] ) / 2
	: $samples[ $mid ];

// Relative gap between the fastest and the slowest pass (0.0 = identical).
$spread = $best > 0 ? ( $best - $worst ) / $best : 0.0;
109+
110+
// Ask opcache for its status (without per-script details). The result stays
// false when the function does not exist or when it returns false.
$opcache_status = false;
if ( function_exists( 'opcache_get_status' ) ) {
	$opcache_status = opcache_get_status( false );
}

// Opcache counts as "on" only when a status array came back; the JIT counts
// as "on" only when that array reports a truthy jit.on entry.
$opcache_on = is_array( $opcache_status );
$jit_on     = $opcache_on
	&& isset( $opcache_status['jit']['on'] )
	&& (bool) $opcache_status['jit']['on'];
113+
114+
// Machine-readable mode: print one pretty-printed JSON document and stop, so
// none of the human-readable output that follows is emitted.
if ( $json ) {
	$report = array(
		'benchmark'    => 'mysql-parser',
		'inline_units' => $inline_units,
		'opcache'      => $opcache_on,
		'jit'          => $jit_on,
		'queries'      => $query_count,
		'warmup'       => $warmup,
		'iterations'   => $iterations,
		// 'qps' carries the same value as 'qps_best'.
		'qps'          => $best,
		'qps_best'     => $best,
		'qps_median'   => $median,
		'qps_mean'     => $mean,
		'qps_worst'    => $worst,
		'spread'       => $spread,
		'failures'     => $failures,
		'exceptions'   => $exceptions,
		'php_version'  => PHP_VERSION,
	);

	echo json_encode( $report, JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES ), "\n";
	exit;
}
138+
139+
// Human-readable report.

// Describe the runtime configuration the numbers were measured under.
if ( $jit_on ) {
	$config = 'opcache + tracing JIT';
} elseif ( $opcache_on ) {
	$config = 'opcache, no JIT';
} else {
	$config = 'no opcache';
}
$mode_suffix = $inline_units ? ', unit inlining' : '';
printf( "MySQL parser (official 8.4 grammar%s) — %s\n", $mode_suffix, $config );

// Warn when the opcache.jit ini setting asks for the JIT but the opcache
// status says it is not running.
$jit_setting   = strtolower( (string) ini_get( 'opcache.jit' ) );
$jit_requested = ! in_array( $jit_setting, array( '', '0', 'off', 'disable' ), true );
if ( $jit_requested && ! $jit_on ) {
	printf( " warning: opcache.jit is set but the JIT is NOT active here — check that opcache is enabled and jit_buffer_size > 0.\n" );
}

printf(
	"%s queries, %d warmup + %d timed passes (end-to-end lex+parse)\n",
	number_format( $query_count ),
	$warmup,
	$iterations
);
printf( " best: %s QPS\n", number_format( $best ) );
printf( " median: %s QPS\n", number_format( $median ) );
printf( " spread: %.1f%% (best vs worst)\n", $spread * 100 );

// Failure rate as a percentage of the corpus; 0.0 when the corpus is empty.
$failure_percent = $query_count > 0 ? $failures / $query_count * 100 : 0.0;
printf( " failures: %d (%.2f%%) | exceptions: %d\n", $failures, $failure_percent, $exceptions );

// More than 10% between the best and worst pass is flagged as a noisy run.
if ( $spread > 0.10 ) {
	printf( " note: >10%% spread — the machine is noisy; close other apps for a steadier number.\n" );
}

0 commit comments

Comments
 (0)