Skip to content

Commit 6d5a875

Browse files
authored
Better result reporting (#655)
* Make local db more resilient to bad records
* warning about volatile db in docs
* test new job manager behavior
* address copilot comments
* Better result reporting
* better results in upload
* one unnecessary elif
1 parent f123097 commit 6d5a875

20 files changed

+106
-86
lines changed

metriq_gym/benchmarks/benchmark.py

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import argparse
22
from typing import Iterable, TYPE_CHECKING, Protocol
3-
from abc import ABC, abstractmethod
3+
from abc import ABC
44

55
from pydantic import BaseModel, computed_field
66
from dataclasses import dataclass
@@ -39,6 +39,14 @@ class BenchmarkScore(BaseModel):
3939
# If not specified, treat uncertainty as not available (N/A)
4040
uncertainty: float | None = None
4141

42+
def __str__(self) -> str:
43+
if self.uncertainty is None or self.uncertainty == "":
44+
return str(self.value)
45+
return f"{self.value} ± {self.uncertainty}"
46+
47+
def __repr__(self) -> str:
48+
return f"BenchmarkScore(value={self.value}, uncertainty={self.uncertainty})"
49+
4250

4351
class BenchmarkResult(BaseModel, ABC):
4452
"""Base class for benchmark results.
@@ -50,9 +58,10 @@ class BenchmarkResult(BaseModel, ABC):
5058

5159
def _iter_metric_items(self):
5260
for name in self.__class__.model_fields:
61+
if name in self.__class__.model_computed_fields:
62+
continue
5363
value = getattr(self, name, None)
5464
if isinstance(value, BenchmarkScore):
55-
# If uncertainty is not provided, leave as None
5665
u = value.uncertainty
5766
yield name, float(value.value), (float(u) if u is not None else None)
5867
elif isinstance(value, (int, float)):
@@ -67,17 +76,16 @@ def values(self) -> dict[str, float]:
6776
def uncertainties(self) -> dict[str, float | None]:
6877
return {name: uncertainty for name, _, uncertainty in self._iter_metric_items()}
6978

70-
@abstractmethod
71-
def compute_score(self) -> float | None:
79+
def compute_score(self) -> BenchmarkScore | None:
7280
"""Hook for computing a scalar score from result metrics.
7381
7482
Default implementation returns None. Benchmarks should override this to
7583
implement single- or multi-metric scoring as appropriate.
7684
"""
77-
...
85+
return None
7886

79-
@computed_field(return_type=float | None)
80-
def score(self) -> float | None:
87+
@computed_field(return_type=BenchmarkScore | None)
88+
def score(self) -> BenchmarkScore | None:
8189
return self.compute_score()
8290

8391

metriq_gym/benchmarks/bseq.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
Benchmark,
2020
BenchmarkData,
2121
BenchmarkResult,
22+
BenchmarkScore,
2223
)
2324
from metriq_gym.helpers.task_helpers import flatten_counts
2425
from metriq_gym.helpers.graph_helpers import (
@@ -38,8 +39,8 @@ class BSEQResult(BenchmarkResult):
3839
largest_connected_size: int
3940
fraction_connected: float = Field(...)
4041

41-
def compute_score(self) -> float | None:
42-
return self.largest_connected_size
42+
def compute_score(self) -> BenchmarkScore:
43+
return BenchmarkScore(value=float(self.largest_connected_size))
4344

4445

4546
@dataclass

metriq_gym/benchmarks/clops.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
Benchmark,
1313
BenchmarkData,
1414
BenchmarkResult,
15+
BenchmarkScore,
1516
)
1617
from metriq_gym.qplatform.job import execution_time
1718
from metriq_gym.qplatform.device import connectivity_graph
@@ -29,8 +30,8 @@ class ClopsData(BenchmarkData):
2930
class ClopsResult(BenchmarkResult):
3031
clops_score: float = Field(...)
3132

32-
def compute_score(self) -> float | None:
33-
return self.clops_score
33+
def compute_score(self) -> BenchmarkScore:
34+
return BenchmarkScore(value=self.clops_score)
3435

3536

3637
# adapted from submodules/qiskit-device-benchmarking/qiskit_device_benchmarking/clops/clops_benchmark.py::create_qubit_map

metriq_gym/benchmarks/lr_qaoa.py

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import math
22
import statistics
33
import networkx as nx
4+
import numpy as np
45
import rustworkx as rx
56
from scipy import stats
67
from dataclasses import dataclass
@@ -18,6 +19,7 @@
1819
Benchmark,
1920
BenchmarkData,
2021
BenchmarkResult,
22+
BenchmarkScore,
2123
)
2224
from metriq_gym.helpers.task_helpers import flatten_counts
2325
from metriq_gym.qplatform.device import connectivity_graph
@@ -168,9 +170,12 @@ class LinearRampQAOAResult(BenchmarkResult):
168170
approx_ratio: list[float]
169171
random_approx_ratio: float = Field(...)
170172
confidence_pass: list[bool]
173+
effective_approx_ratio: list[float] | None = None
171174

172-
def compute_score(self) -> float | None:
173-
return self.approx_ratio[0]
175+
def compute_score(self) -> BenchmarkScore:
176+
if not self.effective_approx_ratio:
177+
raise ValueError("effective_approx_ratio must be populated to compute score.")
178+
return BenchmarkScore(value=float(np.mean(self.effective_approx_ratio)))
174179

175180

176181
def prepare_qaoa_circuit(
@@ -232,6 +237,7 @@ class AggregateStats:
232237
optimal_probability: list[float]
233238
approx_ratio: list[float]
234239
confidence_pass: list[bool]
240+
effective_approx_ratio: list[float]
235241

236242

237243
def calc_trial_stats(
@@ -317,13 +323,18 @@ def calc_stats(data: LinearRampQAOAData, samples: list["MeasCount"]) -> Aggregat
317323
all(stat[ith_layer].confidence_pass for stat in trial_stats)
318324
for ith_layer in range(len(data.qaoa_layers))
319325
]
326+
effective_approx_ratio = [
327+
(r - data.approx_ratio_random_mean) / (1 - data.approx_ratio_random_mean)
328+
for r in approx_ratio
329+
]
320330

321331
return AggregateStats(
322332
trial_stats=trial_stats,
323333
trials=num_trials,
324334
approx_ratio=approx_ratio,
325335
optimal_probability=optimal_probability,
326336
confidence_pass=confidence_pass,
337+
effective_approx_ratio=effective_approx_ratio,
327338
)
328339

329340

@@ -403,7 +414,9 @@ def _build_circuits(
403414
return circuits_with_params, graph_info, optimal_sol, circuit_encoding
404415

405416
def dispatch_handler(self, device: "QuantumDevice") -> LinearRampQAOAData:
406-
circuits_with_params, graph_info, optimal_sol, circuit_encoding = self._build_circuits(device)
417+
circuits_with_params, graph_info, optimal_sol, circuit_encoding = self._build_circuits(
418+
device
419+
)
407420

408421
approx_ratio_random_mean, approx_ratio_random_std = calc_random_stats(
409422
self.params.num_qubits,
@@ -443,6 +456,7 @@ def poll_handler(
443456
approx_ratio=stats.approx_ratio,
444457
random_approx_ratio=job_data.approx_ratio_random_mean,
445458
confidence_pass=stats.confidence_pass,
459+
effective_approx_ratio=stats.effective_approx_ratio,
446460
)
447461

448462
def estimate_resources_handler(

metriq_gym/benchmarks/mirror_circuits.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,8 +48,8 @@ class MirrorCircuitsResult(BenchmarkResult):
4848
polarization: BenchmarkScore = Field(...)
4949
binary_success: bool
5050

51-
def compute_score(self) -> float | None:
52-
return self.values.get("polarization")
51+
def compute_score(self) -> BenchmarkScore:
52+
return self.polarization
5353

5454

5555
@dataclass

metriq_gym/benchmarks/qedc_benchmarks.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -92,9 +92,6 @@ class QEDCResult(BenchmarkResult):
9292

9393
circuit_metrics: QEDC_Metrics
9494

95-
def compute_score(self) -> float | None:
96-
return None
97-
9895

9996
def import_benchmark_module(benchmark_name: str) -> ModuleType:
10097
"""

metriq_gym/benchmarks/qml_kernel.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,8 @@ class QMLKernelData(BenchmarkData):
2828
class QMLKernelResult(BenchmarkResult):
2929
accuracy_score: BenchmarkScore
3030

31-
def compute_score(self) -> float | None:
32-
return self.accuracy_score.value
31+
def compute_score(self) -> BenchmarkScore:
32+
return self.accuracy_score
3333

3434

3535
def ZZfeature_circuit(num_qubits: int) -> QuantumCircuit:

metriq_gym/benchmarks/quantum_volume.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -40,9 +40,6 @@ class QuantumVolumeResult(BenchmarkResult):
4040
p_value: float
4141
trials: int
4242

43-
def compute_score(self) -> float | None:
44-
return None
45-
4643

4744
def prepare_qv_circuits(n: int, num_trials: int) -> tuple[list[QuantumCircuit], list[list[float]]]:
4845
circuits = []

metriq_gym/benchmarks/wit.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -229,8 +229,8 @@ def wit_circuit(num_qubits: int) -> QuantumCircuit:
229229
class WITResult(BenchmarkResult):
230230
expectation_value: BenchmarkScore
231231

232-
def compute_score(self) -> float | None:
233-
return self.expectation_value.value
232+
def compute_score(self) -> BenchmarkScore:
233+
return self.expectation_value
234234

235235

236236
@dataclass

metriq_gym/exporters/base_exporter.py

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -22,16 +22,11 @@ def _derive_device_metadata(self) -> dict[str, Any]:
2222

2323
def as_dict(self):
2424
# Preserve existing top-level fields.
25-
results_block = {
26-
"values": self.result.values,
27-
"uncertainties": self.result.uncertainties if self.result.uncertainties else {},
28-
}
29-
# For single-job dispatches, also include the benchmark-declared score metric
30-
# Include only the declared score metric (no implicit inference)
31-
score_val = getattr(self.result, "score", None)
32-
if score_val is not None:
33-
results_block["score"] = score_val
34-
25+
# For uploads/exports, include the full result payload (already contains score)
26+
results_block = self.result.model_dump()
27+
if results_block.get("score") is None:
28+
results_block.pop("score", None)
29+
# Do not emit a separate uncertainties block; structured fields carry their own
3530
record = {
3631
"app_version": self.metriq_gym_job.app_version,
3732
"timestamp": self.metriq_gym_job.dispatch_time.isoformat(),

0 commit comments

Comments (0)