Commit 0c7498b

add pd1-tabular

Merge commit: 2 parents 911c1cb + f64118f

26 files changed: +2,521 -57 lines

src/mfpbench/__main__.py (+10)

@@ -129,6 +129,7 @@ def do(cls, args: argparse.Namespace) -> None:
             download=True,
             install=False,
             force=args.force,
+            workers=args.workers,
         )

     @override
@@ -149,6 +150,15 @@ def fill_parser(cls, parser: argparse.ArgumentParser) -> argparse.ArgumentParser
             action="store_true",
             help="Print out the available benchmarks data sources",
         )
+        parser.add_argument(
+            "--workers",
+            type=int,
+            default=1,
+            help=(
+                "The number of workers to use for downloading"
+                " if the downloader supports it"
+            ),
+        )
         parser.add_argument(
             "--benchmark",
             choices=[
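For context on the new flag: a downloader that honours --workers typically fans its fetches out over a small pool. A minimal sketch of that pattern, using a hypothetical list of (url, destination) pairs rather than the actual mfpbench download code:

    from concurrent.futures import ThreadPoolExecutor
    from urllib.request import urlretrieve

    def download_all(urls: list[tuple[str, str]], workers: int = 1) -> None:
        """Fetch each (url, dest) pair, in parallel when workers > 1."""
        with ThreadPoolExecutor(max_workers=workers) as pool:
            futures = [pool.submit(urlretrieve, url, dest) for url, dest in urls]
            for future in futures:
                future.result()  # re-raise any download error here

With the argument wired through do() as above, invoking the download command with --workers 4 hands workers=4 to the setup call; downloaders that do not support parallelism can simply ignore it.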

src/mfpbench/benchmark.py (+37 -5)

@@ -73,6 +73,7 @@ def __init__( # noqa: PLR0913
         prior: str | Path | C | Mapping[str, Any] | None = None,
         perturb_prior: float | None = None,
         value_metric: str | None = None,
+        value_metric_test: str | None = None,
         cost_metric: str | None = None,
     ):
         """Initialize the benchmark.
@@ -97,19 +98,30 @@ def __init__( # noqa: PLR0913
                 as the probability of swapping the value for a random one.
             value_metric: The metric to use for this benchmark. Uses
                 the default metric from the Result if None.
+            value_metric_test: The metric to use as a test metric for this benchmark.
+                Uses the default test metric from the Result if left as None, and
+                if there is no default test metric, will return None.
             cost_metric: The cost to use for this benchmark. Uses
                 the default cost from the Result if None.
         """
         if value_metric is None:
             value_metric = result_type.default_value_metric
+        if value_metric_test is None:
+            value_metric_test = result_type.default_value_metric_test

         if cost_metric is None:
             cost_metric = result_type.default_cost_metric

+        # Ensure that the result type actually has an attribute called value_metric
+        if value_metric is None:
+            assert getattr(self.Result, "value_metric", None) is not None
+            value_metric = self.Result.value_metric
+
         self.name = name
         self.seed = seed
         self.space = space
         self.value_metric = value_metric
+        self.value_metric_test: str | None = value_metric_test
         self.cost_metric = cost_metric
         self.fidelity_range: tuple[F, F, F] = fidelity_range
         self.fidelity_name = fidelity_name
@@ -121,10 +133,6 @@ def __init__( # noqa: PLR0913
             for metric_name, metric in self.Result.metric_defs.items()
         }

-        if value_metric is None:
-            assert getattr(self.Result, "value_metric", None) is not None
-            value_metric = self.Result.value_metric
-
         self._prior_arg = prior

         # NOTE: This is handled entirely by subclasses as it requires knowledge
@@ -250,6 +258,7 @@ def query(
         *,
         at: F | None = None,
         value_metric: str | None = None,
+        value_metric_test: str | None = None,
         cost_metric: str | None = None,
     ) -> R:
         """Submit a query and get a result.
@@ -260,11 +269,17 @@ def query(
             value_metric: The metric to use for this result. Uses
                 the value metric passed in to the constructor if not specified,
                 otherwise the default metric from the Result if None.
+            value_metric_test: The metric to use for this result. Uses
+                the value metric passed in to the constructor if not specified,
+                otherwise the default metric from the Result if None. If that
+                is still None, then the `value_metric_test` will be None as well.
             cost_metric: The metric to use for this result. Uses
                 the cost metric passed in to the constructor if not specified,
                 otherwise the default metric from the Result if None.
-
         Returns:
             The result of the query
         """
@@ -282,13 +297,19 @@ def query(
         __config = {k: __config.get(v, v) for k, v in _reverse_renames.items()}

         value_metric = value_metric if value_metric is not None else self.value_metric
+        value_metric_test = (
+            value_metric_test
+            if value_metric_test is not None
+            else self.value_metric_test
+        )
         cost_metric = cost_metric if cost_metric is not None else self.cost_metric

         return self.Result.from_dict(
             config=config,
             fidelity=at,
             result=self._objective_function(__config, at=at),
             value_metric=str(value_metric),
+            value_metric_test=value_metric_test,
             cost_metric=str(cost_metric),
             renames=self._result_renames,
         )
@@ -301,6 +322,7 @@ def trajectory(
         to: F | None = None,
         step: F | None = None,
         value_metric: str | None = None,
+        value_metric_test: str | None = None,
         cost_metric: str | None = None,
     ) -> list[R]:
         """Get the full trajectory of a configuration.
@@ -313,6 +335,10 @@ def trajectory(
             value_metric: The metric to use for this result. Uses
                 the value metric passed in to the constructor if not specified,
                 otherwise the default metric from the Result if None.
+            value_metric_test: The metric to use for this result. Uses
+                the value metric passed in to the constructor if not specified,
+                otherwise the default metric from the Result if None. If that
+                is still None, then the `value_metric_test` will be None as well.
             cost_metric: The metric to use for this result. Uses
                 the cost metric passed in to the constructor if not specified,
                 otherwise the default metric from the Result if None.
@@ -330,6 +356,11 @@ def trajectory(
         __config = {k: __config.get(v, v) for k, v in _reverse_renames.items()}

         value_metric = value_metric if value_metric is not None else self.value_metric
+        value_metric_test = (
+            value_metric_test
+            if value_metric_test is not None
+            else self.value_metric_test
+        )
         cost_metric = cost_metric if cost_metric is not None else self.cost_metric

         return [
@@ -338,6 +369,7 @@ def trajectory(
             fidelity=fidelity,
             result=result,
             value_metric=str(value_metric),
+            value_metric_test=value_metric_test,
             cost_metric=str(cost_metric),
             renames=self._result_renames,
         )
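Taken together, these changes give value_metric_test the same resolution order as value_metric: an explicit argument to query()/trajectory() wins, then the value given to the constructor (which itself falls back to the Result class default), and finally None when no default exists. A standalone sketch of that chain, with illustrative names only:

    def resolve_test_metric(
        call_arg: str | None,    # passed to query()/trajectory()
        ctor_value: str | None,  # self.value_metric_test, already defaulted in __init__
    ) -> str | None:
        # Mirrors: value_metric_test if value_metric_test is not None
        #          else self.value_metric_test
        return call_arg if call_arg is not None else ctor_value

    assert resolve_test_metric("test_acc", "test_balanced_accuracy") == "test_acc"
    assert resolve_test_metric(None, "test_balanced_accuracy") == "test_balanced_accuracy"
    assert resolve_test_metric(None, None) is None  # Result defines no test default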

src/mfpbench/get.py (+10 -3)

@@ -4,16 +4,16 @@
 from typing import TYPE_CHECKING, Any

 from mfpbench.jahs import JAHSBenchmark
-from mfpbench.lcbench_tabular import (
-    LCBenchTabularBenchmark,
-)
+from mfpbench.lcbench_tabular import LCBenchTabularBenchmark
+from mfpbench.nb201_tabular.benchmark import NB201TabularBenchmark
 from mfpbench.pd1 import (
     PD1cifar100_wideresnet_2048,
     PD1imagenet_resnet_512,
     PD1lm1b_transformer_2048,
     PD1translatewmt_xformer_64,
     PD1uniref50_transformer_128,
 )
+from mfpbench.pd1_tabular import PD1TabularBenchmark
 from mfpbench.synthetic.hartmann import (
     MFHartmann3Benchmark,
     MFHartmann3BenchmarkBad,
@@ -26,6 +26,7 @@
     MFHartmann6BenchmarkModerate,
     MFHartmann6BenchmarkTerrible,
 )
+from mfpbench.taskset_tabular import TaskSetTabularBenchmark
 from mfpbench.yahpo import (
     IAMLglmnetBenchmark,
     IAMLrangerBenchmark,
@@ -84,6 +85,12 @@
     "imagenet_resnet_512": PD1imagenet_resnet_512,
     # LCBenchTabular
     "lcbench_tabular": LCBenchTabularBenchmark,
+    # PD1Tabular
+    "pd1_tabular": PD1TabularBenchmark,
+    # TaskSetTabular
+    "taskset_tabular": TaskSetTabularBenchmark,
+    # nb201 tabular
+    "nb201_tabular": NB201TabularBenchmark,
 }
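With these registry entries, the new tabular benchmarks are reachable by name. A sketch of a lookup through the package's top-level get() helper; whether extra arguments (such as a task id) are required for pd1_tabular is not confirmed by this diff:

    import mfpbench

    # Resolves "pd1_tabular" to PD1TabularBenchmark via the mapping above.
    benchmark = mfpbench.get("pd1_tabular")
    config = benchmark.sample()
    result = benchmark.query(config)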

src/mfpbench/jahs/benchmark.py (+1)

@@ -56,6 +56,7 @@ class JAHSConfig(Config):
 class JAHSResult(Result[JAHSConfig, int]):
     default_value_metric: ClassVar[str] = "valid_acc"
     default_cost_metric: ClassVar[str] = "runtime"
+    default_value_metric_test: ClassVar[str] = "test_acc"

     metric_defs: ClassVar[Mapping[str, Metric]] = {
         "runtime": Metric(minimize=True, bounds=(0, np.inf)),

src/mfpbench/lcbench_tabular/benchmark.py (+5 -2)

@@ -140,14 +140,15 @@ class LCBenchTabularConfig(TabularConfig):
 class LCBenchTabularResult(Result[LCBenchTabularConfig, int]):
     metric_defs: ClassVar[Mapping[str, Metric]] = {
         "val_accuracy": Metric(minimize=False, bounds=(0, 100)),
-        "val_balanced_accuracy": Metric(minimize=False, bounds=(0, 100)),
+        "val_balanced_accuracy": Metric(minimize=False, bounds=(0, 1)),
         "val_cross_entropy": Metric(minimize=True, bounds=(0, np.inf)),
         "test_accuracy": Metric(minimize=False, bounds=(0, 100)),
-        "test_balanced_accuracy": Metric(minimize=False, bounds=(0, 100)),
+        "test_balanced_accuracy": Metric(minimize=False, bounds=(0, 1)),
         "test_cross_entropy": Metric(minimize=True, bounds=(0, np.inf)),
         "time": Metric(minimize=True, bounds=(0, np.inf)),
     }
     default_value_metric: ClassVar[str] = "val_balanced_accuracy"
+    default_value_metric_test: ClassVar[str] = "test_balanced_accuracy"
     default_cost_metric: ClassVar[str] = "time"

     time: Metric.Value
@@ -214,6 +215,7 @@ def __init__(
         prior: str | Path | LCBenchTabularConfig | Mapping[str, Any] | None = None,
         perturb_prior: float | None = None,
         value_metric: str | None = None,
+        value_metric_test: str | None = None,
         cost_metric: str | None = None,
     ) -> None:
         """Initialize the benchmark.
@@ -282,6 +284,7 @@ def __init__(
             result_type=LCBenchTabularResult,
             config_type=LCBenchTabularConfig,
             value_metric=value_metric,
+            value_metric_test=value_metric_test,
             cost_metric=cost_metric,
             space=space,
             seed=seed,
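The bounds correction matters because LCBench reports balanced accuracy as a fraction in [0, 1]: metric.py defines an OutOfBoundsError and metric values are interpreted relative to their declared bounds, so the old (0, 100) range would place every achievable score at the very bottom of the scale. A small illustration, constructing Metric exactly as the diff does:

    from mfpbench.metric import Metric

    balanced_accuracy = Metric(minimize=False, bounds=(0, 1))
    value = balanced_accuracy.as_value(0.87)
    # Under bounds=(0, 1), 0.87 sits near the top of the range; under the old
    # bounds=(0, 100), the same 0.87 would look like a near-worst-case score
    # in any scoring computed relative to the bounds.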

src/mfpbench/metric.py (+3)

@@ -3,6 +3,7 @@
 from dataclasses import dataclass, field

 import numpy as np
+import pandas as pd


 class OutOfBoundsError(ValueError):
@@ -38,6 +39,8 @@ def as_value(self, value: float) -> Metric.Value:
         Returns:
             The metric value.
         """
+        if pd.isna(value):
+            value = np.inf
         return Metric.Value(value=value, definition=self)

     @property
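The new guard means NaN entries in tabular result data (e.g. from diverged training runs) are coerced to +inf before being wrapped, rather than propagating NaN through comparisons. A quick check of the behaviour, assuming the bounds check treats the upper bound as inclusive:

    import numpy as np
    from mfpbench.metric import Metric

    loss = Metric(minimize=True, bounds=(0, np.inf))
    value = loss.as_value(float("nan"))
    assert value.value == np.inf  # NaN becomes the worst value for a minimized metric

Note that +inf is only the pessimal value for metrics that are minimized; for a maximized metric this coercion would mark a NaN as the best possible score, so callers relying on it should keep that in mind.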

src/mfpbench/nb201_tabular/__init__.py

Whitespace-only changes.
