from datetime import timedelta
from enum import StrEnum, auto
from pathlib import Path
-from typing import Annotated
+from typing import Annotated, Literal

import mlperf_loadgen as lg
from loguru import logger
-from pydantic import BaseModel, DirectoryPath, Field, field_validator
+from pydantic import BaseModel, DirectoryPath, Field, FilePath, field_validator
from pydantic_typer import Typer
from typer import Option

-from .evaluation import Evaluator
+from .evaluation import run_evaluation
from .task import ShopifyGlobalCatalogue

app = Typer()
@@ -179,7 +179,9 @@ class TestSettings(BaseModel):
        int,
        Field(
            description="""The minimum testing query count.
-            The benchmark runs until this value has been met.""",
+            The benchmark runs until this value has been met.
+            If min_query_count is less than the total number of samples in the dataset,
+            only the first min_query_count samples will be used during testing.""",
        ),
    ] = 100

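The docstring added above describes a truncation rule. Below is a minimal illustrative sketch of that behaviour, using a hypothetical helper and a plain list in place of the benchmark's real dataset loader:

def effective_samples(dataset: list, min_query_count: int) -> list:
    """Return the samples the benchmark would actually query (hypothetical helper)."""
    # Documented behaviour: when the dataset has more samples than min_query_count,
    # only the first min_query_count samples are used during testing.
    if min_query_count < len(dataset):
        return dataset[:min_query_count]
    return dataset

# A 1000-sample dataset with min_query_count=100 exercises only samples 0..99.
assert len(effective_samples(list(range(1000)), 100)) == 100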
@@ -348,22 +350,6 @@ def to_lgtype(self) -> tuple[lg.TestSettings, lg.LogSettings]:
        log_settings = self.logging.to_lgtype()
        return (test_settings, log_settings)

-class Evaluation(BaseModel):
-    """Evaluate the results of the accuracy scenario."""
-    enable_evaluation: Annotated[
-        bool,
-        Field(
-            description="Evaluate the results of the accuracy scenario.",
-        ),
-    ] = False
-
-    filename: Annotated[
-        Path,
-        Field(
-            description="Location of the accuracy file.",
-        ),
-    ] = Path("./output/mlperf_log_accuracy.json")
-

class Model(BaseModel):
    """Specifies the model to use for the VL2L benchmark."""
@@ -391,6 +377,11 @@ class Dataset(BaseModel):
        ),
    ] = None

+    split: Annotated[
+        Literal["train", "test"],
+        Field(description="The dataset split to use: train or test."),
+    ] = "train"
+

class Verbosity(StrEnum):
    """The verbosity level of the logger."""
@@ -422,15 +413,28 @@ class Endpoint(BaseModel):
        Field(description="The API key to authenticate the inference requests."),
    ] = ""

+@app.command()
+def evaluate(
+    filename: Annotated[
+        FilePath,
+        Option(
+            help="Location of the accuracy file.",
+        ),
+    ],
+    dataset: Dataset,
+) -> None:
+    """Evaluate the accuracy of the VLM responses."""
+    logger.info("Evaluating the accuracy file")
+    run_evaluation(filename=filename, dataset=dataset)
+

@app.command()
-def main(
+def benchmark(
    *,
    settings: Settings,
    model: Model,
    dataset: Dataset,
    endpoint: Endpoint,
-    evaluation: Evaluation,
    random_seed: Annotated[
        int,
        Option(help="The seed for the random number generator used by the benchmark."),
@@ -441,33 +445,27 @@ def main(
    ] = Verbosity.INFO,
) -> None:
    """Main CLI for running the VL2L benchmark."""
-    if evaluation.enable_evaluation:
-        logger.info("Evaluating the accuracy file")
-        evaluator = Evaluator(filename=evaluation.filename, dataset_cli=dataset)
-        evaluator.run_evaluation()
-    else:
-        logger.remove()
-        logger.add(sys.stdout, level=verbosity.value.upper())
-        logger.info("Running VL2L benchmark with settings: {}", settings)
-        logger.info("Running VL2L benchmark with model: {}", model)
-        logger.info("Running VL2L benchmark with dataset: {}", dataset)
-        logger.info(
-            "Running VL2L benchmark with OpenAI API endpoint: {}",
-            endpoint)
-        logger.info("Running VL2L benchmark with random seed: {}", random_seed)
-        test_settings, log_settings = settings.to_lgtype()
-        task = ShopifyGlobalCatalogue(
-            dataset_cli=dataset,
-            model_cli=model,
-            endpoint_cli=endpoint,
-            scenario=settings.test.scenario,
-            min_query_count=settings.test.min_query_count,
-            random_seed=random_seed,
-        )
-        sut = task.construct_sut()
-        qsl = task.construct_qsl()
-        logger.info("Starting the VL2L benchmark with LoadGen...")
-        lg.StartTestWithLogSettings(sut, qsl, test_settings, log_settings)
-        logger.info("The VL2L benchmark with LoadGen completed.")
-        lg.DestroyQSL(qsl)
-        lg.DestroySUT(sut)
+    logger.remove()
+    logger.add(sys.stdout, level=verbosity.value.upper())
+    logger.info("Running VL2L benchmark with settings: {}", settings)
+    logger.info("Running VL2L benchmark with model: {}", model)
+    logger.info("Running VL2L benchmark with dataset: {}", dataset)
+    logger.info(
+        "Running VL2L benchmark with OpenAI API endpoint: {}",
+        endpoint)
+    logger.info("Running VL2L benchmark with random seed: {}", random_seed)
+    test_settings, log_settings = settings.to_lgtype()
+    task = ShopifyGlobalCatalogue(
+        dataset_cli=dataset,
+        model_cli=model,
+        endpoint_cli=endpoint,
+        settings=settings.test,
+        random_seed=random_seed,
+    )
+    sut = task.construct_sut()
+    qsl = task.construct_qsl()
+    logger.info("Starting the VL2L benchmark with LoadGen...")
+    lg.StartTestWithLogSettings(sut, qsl, test_settings, log_settings)
+    logger.info("The VL2L benchmark with LoadGen completed.")
+    lg.DestroyQSL(qsl)
+    lg.DestroySUT(sut)
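With this change the single main entry point becomes two subcommands, benchmark and evaluate, so an accuracy log can be re-scored without re-running LoadGen. A minimal smoke-test sketch of that wiring follows; the import path vl2l.cli, the compatibility of pydantic-typer's Typer with typer.testing, and the dotted --dataset.split option spelling are all assumptions for illustration, not verified against the repository:

# Hypothetical smoke test for the new CLI layout (see assumptions above).
from typer.testing import CliRunner

from vl2l.cli import app  # assumed module path for the file shown in this diff

runner = CliRunner()

# The top-level help should now list both subcommands.
result = runner.invoke(app, ["--help"])
assert "benchmark" in result.output
assert "evaluate" in result.output

# Re-score an existing accuracy log produced by an earlier accuracy run.
result = runner.invoke(
    app,
    [
        "evaluate",
        "--filename", "./output/mlperf_log_accuracy.json",
        "--dataset.split", "test",  # assumed option spelling for the nested Dataset model
    ],
)
print(result.exit_code)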