Skip to content

Commit 6d5a875

Browse files
authored
Better result reporting (#655)
* Make local db more resilient to bad records
* warning about volatile db in docs
* test new job manager behavior
* address copilot comments
* Better result reporting
* better results in upload
* one unnecessary elif
1 parent f123097 commit 6d5a875

20 files changed

+106
-86
lines changed

metriq_gym/benchmarks/benchmark.py

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import argparse
22
from typing import Iterable, TYPE_CHECKING, Protocol
3-
from abc import ABC, abstractmethod
3+
from abc import ABC
44

55
from pydantic import BaseModel, computed_field
66
from dataclasses import dataclass
@@ -39,6 +39,14 @@ class BenchmarkScore(BaseModel):
3939
# If not specified, treat uncertainty as not available (N/A)
4040
uncertainty: float | None = None
4141

42+
def __str__(self) -> str:
43+
if self.uncertainty is None or self.uncertainty == "":
44+
return str(self.value)
45+
return f"{self.value} ± {self.uncertainty}"
46+
47+
def __repr__(self) -> str:
48+
return f"BenchmarkScore(value={self.value}, uncertainty={self.uncertainty})"
49+
4250

4351
class BenchmarkResult(BaseModel, ABC):
4452
"""Base class for benchmark results.
@@ -50,9 +58,10 @@ class BenchmarkResult(BaseModel, ABC):
5058

5159
def _iter_metric_items(self):
5260
for name in self.__class__.model_fields:
61+
if name in self.__class__.model_computed_fields:
62+
continue
5363
value = getattr(self, name, None)
5464
if isinstance(value, BenchmarkScore):
55-
# If uncertainty is not provided, leave as None
5665
u = value.uncertainty
5766
yield name, float(value.value), (float(u) if u is not None else None)
5867
elif isinstance(value, (int, float)):
@@ -67,17 +76,16 @@ def values(self) -> dict[str, float]:
6776
def uncertainties(self) -> dict[str, float | None]:
6877
return {name: uncertainty for name, _, uncertainty in self._iter_metric_items()}
6978

70-
@abstractmethod
71-
def compute_score(self) -> float | None:
79+
def compute_score(self) -> BenchmarkScore | None:
7280
"""Hook for computing a scalar score from result metrics.
7381
7482
Default implementation returns None. Benchmarks should override this to
7583
implement single- or multi-metric scoring as appropriate.
7684
"""
77-
...
85+
return None
7886

79-
@computed_field(return_type=float | None)
80-
def score(self) -> float | None:
87+
@computed_field(return_type=BenchmarkScore | None)
88+
def score(self) -> BenchmarkScore | None:
8189
return self.compute_score()
8290

8391

metriq_gym/benchmarks/bseq.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
Benchmark,
2020
BenchmarkData,
2121
BenchmarkResult,
22+
BenchmarkScore,
2223
)
2324
from metriq_gym.helpers.task_helpers import flatten_counts
2425
from metriq_gym.helpers.graph_helpers import (
@@ -38,8 +39,8 @@ class BSEQResult(BenchmarkResult):
3839
largest_connected_size: int
3940
fraction_connected: float = Field(...)
4041

41-
def compute_score(self) -> float | None:
42-
return self.largest_connected_size
42+
def compute_score(self) -> BenchmarkScore:
43+
return BenchmarkScore(value=float(self.largest_connected_size))
4344

4445

4546
@dataclass

metriq_gym/benchmarks/clops.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
Benchmark,
1313
BenchmarkData,
1414
BenchmarkResult,
15+
BenchmarkScore,
1516
)
1617
from metriq_gym.qplatform.job import execution_time
1718
from metriq_gym.qplatform.device import connectivity_graph
@@ -29,8 +30,8 @@ class ClopsData(BenchmarkData):
2930
class ClopsResult(BenchmarkResult):
3031
clops_score: float = Field(...)
3132

32-
def compute_score(self) -> float | None:
33-
return self.clops_score
33+
def compute_score(self) -> BenchmarkScore:
34+
return BenchmarkScore(value=self.clops_score)
3435

3536

3637
# adapted from submodules/qiskit-device-benchmarking/qiskit_device_benchmarking/clops/clops_benchmark.py::create_qubit_map

metriq_gym/benchmarks/lr_qaoa.py

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import math
22
import statistics
33
import networkx as nx
4+
import numpy as np
45
import rustworkx as rx
56
from scipy import stats
67
from dataclasses import dataclass
@@ -18,6 +19,7 @@
1819
Benchmark,
1920
BenchmarkData,
2021
BenchmarkResult,
22+
BenchmarkScore,
2123
)
2224
from metriq_gym.helpers.task_helpers import flatten_counts
2325
from metriq_gym.qplatform.device import connectivity_graph
@@ -168,9 +170,12 @@ class LinearRampQAOAResult(BenchmarkResult):
168170
approx_ratio: list[float]
169171
random_approx_ratio: float = Field(...)
170172
confidence_pass: list[bool]
173+
effective_approx_ratio: list[float] | None = None
171174

172-
def compute_score(self) -> float | None:
173-
return self.approx_ratio[0]
175+
def compute_score(self) -> BenchmarkScore:
176+
if not self.effective_approx_ratio:
177+
raise ValueError("effective_approx_ratio must be populated to compute score.")
178+
return BenchmarkScore(value=float(np.mean(self.effective_approx_ratio)))
174179

175180

176181
def prepare_qaoa_circuit(
@@ -232,6 +237,7 @@ class AggregateStats:
232237
optimal_probability: list[float]
233238
approx_ratio: list[float]
234239
confidence_pass: list[bool]
240+
effective_approx_ratio: list[float]
235241

236242

237243
def calc_trial_stats(
@@ -317,13 +323,18 @@ def calc_stats(data: LinearRampQAOAData, samples: list["MeasCount"]) -> Aggregat
317323
all(stat[ith_layer].confidence_pass for stat in trial_stats)
318324
for ith_layer in range(len(data.qaoa_layers))
319325
]
326+
effective_approx_ratio = [
327+
(r - data.approx_ratio_random_mean) / (1 - data.approx_ratio_random_mean)
328+
for r in approx_ratio
329+
]
320330

321331
return AggregateStats(
322332
trial_stats=trial_stats,
323333
trials=num_trials,
324334
approx_ratio=approx_ratio,
325335
optimal_probability=optimal_probability,
326336
confidence_pass=confidence_pass,
337+
effective_approx_ratio=effective_approx_ratio,
327338
)
328339

329340

@@ -403,7 +414,9 @@ def _build_circuits(
403414
return circuits_with_params, graph_info, optimal_sol, circuit_encoding
404415

405416
def dispatch_handler(self, device: "QuantumDevice") -> LinearRampQAOAData:
406-
circuits_with_params, graph_info, optimal_sol, circuit_encoding = self._build_circuits(device)
417+
circuits_with_params, graph_info, optimal_sol, circuit_encoding = self._build_circuits(
418+
device
419+
)
407420

408421
approx_ratio_random_mean, approx_ratio_random_std = calc_random_stats(
409422
self.params.num_qubits,
@@ -443,6 +456,7 @@ def poll_handler(
443456
approx_ratio=stats.approx_ratio,
444457
random_approx_ratio=job_data.approx_ratio_random_mean,
445458
confidence_pass=stats.confidence_pass,
459+
effective_approx_ratio=stats.effective_approx_ratio,
446460
)
447461

448462
def estimate_resources_handler(

metriq_gym/benchmarks/mirror_circuits.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,8 +48,8 @@ class MirrorCircuitsResult(BenchmarkResult):
4848
polarization: BenchmarkScore = Field(...)
4949
binary_success: bool
5050

51-
def compute_score(self) -> float | None:
52-
return self.values.get("polarization")
51+
def compute_score(self) -> BenchmarkScore:
52+
return self.polarization
5353

5454

5555
@dataclass

metriq_gym/benchmarks/qedc_benchmarks.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -92,9 +92,6 @@ class QEDCResult(BenchmarkResult):
9292

9393
circuit_metrics: QEDC_Metrics
9494

95-
def compute_score(self) -> float | None:
96-
return None
97-
9895

9996
def import_benchmark_module(benchmark_name: str) -> ModuleType:
10097
"""

metriq_gym/benchmarks/qml_kernel.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,8 @@ class QMLKernelData(BenchmarkData):
2828
class QMLKernelResult(BenchmarkResult):
2929
accuracy_score: BenchmarkScore
3030

31-
def compute_score(self) -> float | None:
32-
return self.accuracy_score.value
31+
def compute_score(self) -> BenchmarkScore:
32+
return self.accuracy_score
3333

3434

3535
def ZZfeature_circuit(num_qubits: int) -> QuantumCircuit:

metriq_gym/benchmarks/quantum_volume.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -40,9 +40,6 @@ class QuantumVolumeResult(BenchmarkResult):
4040
p_value: float
4141
trials: int
4242

43-
def compute_score(self) -> float | None:
44-
return None
45-
4643

4744
def prepare_qv_circuits(n: int, num_trials: int) -> tuple[list[QuantumCircuit], list[list[float]]]:
4845
circuits = []

metriq_gym/benchmarks/wit.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -229,8 +229,8 @@ def wit_circuit(num_qubits: int) -> QuantumCircuit:
229229
class WITResult(BenchmarkResult):
230230
expectation_value: BenchmarkScore
231231

232-
def compute_score(self) -> float | None:
233-
return self.expectation_value.value
232+
def compute_score(self) -> BenchmarkScore:
233+
return self.expectation_value
234234

235235

236236
@dataclass

metriq_gym/exporters/base_exporter.py

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -22,16 +22,11 @@ def _derive_device_metadata(self) -> dict[str, Any]:
2222

2323
def as_dict(self):
2424
# Preserve existing top-level fields.
25-
results_block = {
26-
"values": self.result.values,
27-
"uncertainties": self.result.uncertainties if self.result.uncertainties else {},
28-
}
29-
# For single-job dispatches, also include the benchmark-declared score metric
30-
# Include only the declared score metric (no implicit inference)
31-
score_val = getattr(self.result, "score", None)
32-
if score_val is not None:
33-
results_block["score"] = score_val
34-
25+
# For uploads/exports, include the full result payload (already contains score)
26+
results_block = self.result.model_dump()
27+
if results_block.get("score") is None:
28+
results_block.pop("score", None)
29+
# Do not emit a separate uncertainties block; structured fields carry their own
3530
record = {
3631
"app_version": self.metriq_gym_job.app_version,
3732
"timestamp": self.metriq_gym_job.dispatch_time.isoformat(),

0 commit comments

Comments (0)