
Commit bbc8e9f

Merge pull request #210 from mdboom/estimate-sample-size
Add warnings about too few or too many samples
2 parents: c584266 + a02bff9

5 files changed: +98 −8 lines changed

doc/api.rst

+13
@@ -345,6 +345,19 @@ Benchmark class
 
       Raise an exception if the benchmark has no values.
 
+   .. method:: required_nprocesses()
+
+      Determines the number of separate process runs that would be required
+      to achieve stable results. Specifically, the target is to have 95%
+      certainty that there is a variance of less than 1%. If the result is
+      greater than the number of processes recorded in the input data, the
+      value is meaningless and only means "more samples are required".
+
+      The method used is described in this Wikipedia article about estimating
+      the sampling of a mean:
+
+      https://en.wikipedia.org/wiki/Sample_size_determination#Estimation_of_a_mean
+
    .. method:: update_metadata(metadata: dict)
 
       Update metadata of all runs of the benchmark.
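
For orientation, a minimal sketch of calling the new method from the Python API (the result file name "bench.json" is a placeholder):

    import pyperf

    # Load a result file produced by an earlier pyperf run.
    bench = pyperf.Benchmark.load("bench.json")

    # Estimate how many worker processes are needed for a stable result.
    nprocesses = bench.required_nprocesses()
    if nprocesses is None:
        print("not enough runs to estimate a sample size")
    elif nprocesses > len(bench.get_runs()):
        print("more samples are required (estimate: %d processes)" % nprocesses)
    else:
        print("%d processes should be enough" % nprocesses)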

pyperf/__main__.py

+6 −2

@@ -455,7 +455,8 @@ def display_benchmarks(args, show_metadata=False, hist=False, stats=False,
             dump=dump,
             checks=checks,
             result=result,
-            display_runs_args=display_runs_args)
+            display_runs_args=display_runs_args,
+            only_checks=only_checks)
 
         if bench_lines:
             empty_line(lines)
@@ -491,10 +492,13 @@ def display_benchmarks(args, show_metadata=False, hist=False, stats=False,
            empty_line(output)
        output.extend(lines)
 
+    contains_warning = False
     for line in output:
+        if line.startswith("WARNING:"):
+            contains_warning = True
         print(line)
 
-    if not output and only_checks:
+    if not contains_warning and only_checks:
         if len(data) == 1:
             print("The benchmark seems to be stable")
         else:
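
This code path runs under the check subcommand (only_checks). The change scans the already-formatted output for warning lines, so "The benchmark seems to be stable" is only printed when no "WARNING:" line was emitted. A minimal standalone sketch of the same pattern (the sample output lines are made up):

    output = [
        "Mean +- std dev: 22.5 ms +- 0.2 ms",
        "WARNING: Not enough samples to get a stable result",
    ]

    contains_warning = False
    for line in output:
        if line.startswith("WARNING:"):
            contains_warning = True
        print(line)

    if not contains_warning:
        print("The benchmark seems to be stable")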

pyperf/_bench.py

+41
@@ -424,6 +424,47 @@ def median_abs_dev(self):
             raise ValueError("MAD must be >= 0")
         return value
 
+    def required_nprocesses(self):
+        """
+        Determines the number of separate process runs that would be required
+        to achieve stable results. Specifically, the target is to have 95%
+        certainty that there is a variance of less than 1%. If the result is
+        greater than the number of processes recorded in the input data, the
+        value is meaningless and only means "more samples are required".
+
+        The method used is described in this Wikipedia article about estimating
+        the sampling of a mean:
+
+        https://en.wikipedia.org/wiki/Sample_size_determination#Estimation_of_a_mean
+        """
+        # Get the means of the values per process. The values within the process
+        # often vary considerably (e.g. due to cache effects), but the variances
+        # between processes should be fairly consistent. Additionally, this
+        # value is intended to be advice for the number of processes to run.
+        values = []
+        for run in self._runs:
+            if len(run.values):
+                values.append(statistics.mean(run.values))
+
+        if len(values) < 2:
+            return None
+
+        total = math.fsum(values)
+        mean = total / len(values)
+        stddev = statistics.stdev(values)
+
+        # Normalize the stddev so we can target "percentage changed" rather than
+        # absolute time
+        sigma = stddev / mean
+
+        # 95% certainty
+        Z = 1.96
+        # 1% variation
+        W = 0.01
+
+        # (4Z²σ²)/(W²)
+        return math.ceil((4 * Z ** 2 * sigma ** 2) / (W ** 2))
+
     def percentile(self, p):
         if not (0 <= p <= 100):
             raise ValueError("p must be in the range [0; 100]")

pyperf/_cli.py

+25 −3

@@ -400,7 +400,7 @@ def value_bucket(value):
     return lines
 
 
-def format_checks(bench, lines=None):
+def format_checks(bench, lines=None, check_too_many_processes=False):
     if lines is None:
         lines = []
 
@@ -412,6 +412,7 @@ def format_checks(bench, lines=None):
     mean = bench.mean()
     warnings = []
     warn = warnings.append
+    required_nprocesses = None
 
     # Display a warning if the standard deviation is greater than 10%
     # of the mean
@@ -421,6 +422,14 @@ def format_checks(bench, lines=None):
     if percent >= 10.0:
         warn("the standard deviation (%s) is %.0f%% of the mean (%s)"
              % (bench.format_value(stdev), percent, bench.format_value(mean)))
+    else:
+        # display a warning if the number of samples isn't enough to get a stable result
+        required_nprocesses = bench.required_nprocesses()
+        if (
+            required_nprocesses is not None and
+            required_nprocesses > len(bench._runs)
+        ):
+            warn("Not enough samples to get a stable result (95% certainty of less than 1% variation)")
 
     # Minimum and maximum, detect obvious outliers
     for minimum, value in (
@@ -457,6 +466,19 @@ def format_checks(bench, lines=None):
         lines.append("Use pyperf stats, pyperf dump and pyperf hist to analyze results.")
         lines.append("Use --quiet option to hide these warnings.")
 
+    if check_too_many_processes:
+        if required_nprocesses is None:
+            required_nprocesses = bench.required_nprocesses()
+        if (
+            required_nprocesses is not None and
+            required_nprocesses < len(bench._runs) * 0.75
+        ):
+            lines.append("Benchmark was run more times than necessary to get a stable result.")
+            lines.append(
+                "Consider passing processes=%d to the Runner constructor to save time." %
+                required_nprocesses
+            )
+
     # Warn if nohz_full+intel_pstate combo if found in cpu_config metadata
     for run in bench._runs:
         cpu_config = run._metadata.get('cpu_config')
@@ -549,7 +571,7 @@ def format_result(bench):
 
 def format_benchmark(bench, checks=True, metadata=False,
                      dump=False, stats=False, hist=False, show_name=False,
-                     result=True, display_runs_args=None):
+                     result=True, display_runs_args=None, only_checks=False):
     lines = []
 
     if metadata:
@@ -568,7 +590,7 @@ def format_benchmark(bench, checks=True, metadata=False,
         format_stats(bench, lines=lines)
 
     if checks:
-        format_checks(bench, lines=lines)
+        format_checks(bench, lines=lines, check_too_many_processes=only_checks)
 
     if result:
         empty_line(lines)
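
When the new check decides the benchmark was run more often than necessary, the advice it prints refers to the Runner constructor's processes argument. A minimal sketch of a benchmark script acting on that advice (the value 7 and the benchmarked function are placeholders):

    import pyperf

    # Limit the number of worker processes, as suggested by "pyperf check".
    runner = pyperf.Runner(processes=7)
    runner.bench_func('sorted(range(1000))', sorted, list(range(1000)))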

pyperf/tests/test_perf_cli.py

+13 −3

@@ -628,8 +628,18 @@ def test_slowest(self):
 
     def test_check_stable(self):
         stdout = self.run_command('check', TELCO)
-        self.assertEqual(stdout.rstrip(),
-                         'The benchmark seems to be stable')
+        self.assertIn(
+            textwrap.dedent(
+                """
+                Benchmark was run more times than necessary to get a stable result.
+                Consider passing processes=7 to the Runner constructor to save time.
+                """
+            ).strip(), stdout.rstrip()
+        )
+        self.assertIn(
+            'The benchmark seems to be stable',
+            stdout.rstrip()
+        )
 
     def test_command(self):
         command = [sys.executable, '-c', 'pass']
@@ -689,7 +699,7 @@ def _check_track_memory(self, track_option):
                                   '[1,2]*1000',
                                   '-o', tmp_name)
         bench = pyperf.Benchmark.load(tmp_name)
-
+
         self._check_track_memory_bench(bench, loops=5)
 
     def test_track_memory(self):
