Skip to content

Commit 0dc13dc

Browse files
committed
additional fixes to reviews
1 parent 46e4fc0 commit 0dc13dc

File tree

3 files changed

+103
-56
lines changed

3 files changed

+103
-56
lines changed

multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/cli.py

Lines changed: 28 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
from .task import ShopifyGlobalCatalogue
1919

2020
app = Typer()
21-
21+
SplitType = Literal["train", "test"]
2222

2323
class TestScenario(StrEnum):
2424
"""The test scenario for the MLPerf inference LoadGen."""
@@ -378,9 +378,33 @@ class Dataset(BaseModel):
378378
] = None
379379

380380
split: Annotated[
381-
Literal["train", "test"],
382-
Field(description="choose between train or test split"),
383-
] = "train"
381+
list[str],
382+
Field(
383+
description=(
384+
"List of splits in order (e.g. ['train', 'test']). "
385+
"Allowed values: 'train', 'test'."
386+
),
387+
),
388+
] = ["train"]
389+
390+
@field_validator("split", mode="before")
391+
@classmethod
392+
def normalize_and_validate_split(cls, v: str) -> list[str]:
393+
"""Normalize and validate the input string of field split."""
394+
# Allow a single string like "train" or "train,test"
395+
if isinstance(v, str):
396+
v = [part.strip() for part in v.split(",") if part.strip()]
397+
398+
if not isinstance(v, list):
399+
err="split must be a string or a list of strings"
400+
raise TypeError(err)
401+
402+
allowed = {"train", "test"}
403+
for item in v:
404+
if item not in allowed:
405+
msg_err = f"Invalid split {item!r}. Must be one of: {sorted(allowed)}"
406+
raise ValueError(msg_err)
407+
return v
384408

385409

386410
class Verbosity(StrEnum):

multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/evaluation.py

Lines changed: 38 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
"""Task definitions for the VL2L benchmark."""
22

3+
from __future__ import annotations
4+
35
import json
46
from pathlib import Path
57
from typing import TYPE_CHECKING
@@ -8,11 +10,12 @@
810
from datasets import load_dataset
911
from hiclass.metrics import f1
1012
from loguru import logger
11-
from pydantic import FilePath
1213
from sklearn.metrics import f1_score
1314
from tabulate import tabulate
1415

1516
if TYPE_CHECKING:
17+
from pydantic import FilePath
18+
1619
from .cli import Dataset as DatasetCLI
1720

1821

@@ -58,8 +61,8 @@ def get_hierarchical_components(predicted_path: str,
5861
return intersection_count, pred_length, true_length
5962

6063

61-
def calculate_hierarchical_metrics(data: list[tuple[str, str]]) -> float:
62-
"""Calculates the aggregate hP, hR, and hF scores for a list of samples.
64+
def calculate_hierarchical_f1(data: list[tuple[str, str]]) -> float:
65+
"""Calculates the aggregate hF scores for a list of samples.
6366
6467
Args:
6568
data: A list of tuples, where each tuple is
@@ -107,9 +110,30 @@ def calculate_exact_match(generated_text: str, original_text: str) -> float:
107110

108111
return 1.0 if gen == orig else 0.0
109112

113+
def calculate_secondhand_f1(data: list[tuple[str, str]]) -> float:
114+
"""Calculate F1 score of is_secondhand field.
115+
116+
Args:
117+
data: List of tuples of predicted and true values
118+
Returns:
119+
f1 score
120+
"""
121+
y_pred = []
122+
y_src = []
123+
for pred, src in data:
124+
y_pred.append(pred)
125+
y_src.append(src)
126+
127+
return f1_score(y_src, y_pred)
128+
129+
def calculate_hiclass_f1(data: list[tuple[str, str]]) -> float:
130+
"""Alt method to calculate hierarchical F1.
110131
111-
def alt_f1_score(data: list[tuple[str, str]]) -> float:
112-
"""Alt method to calculate hierarchical F1."""
132+
Args:
133+
data: List of tuples of predicted and true values
134+
Returns:
135+
f1 score
136+
"""
113137
y_pred_raw = []
114138
y_true_raw = []
115139

@@ -142,19 +166,19 @@ def alt_f1_score(data: list[tuple[str, str]]) -> float:
142166
return f1(y_true, y_pred)
143167

144168

145-
def run_evaluation(filename: FilePath, dataset: "DatasetCLI") -> None:
169+
def run_evaluation(filename: FilePath, dataset: DatasetCLI) -> None:
146170
"""Main function to run the evaluation."""
147171
with Path.open(filename) as f:
148172
model_output = json.load(f)
149173

150174
original_data = load_dataset(
151175
dataset.repo_id,
152176
dataset.token,
153-
)[dataset.split]
177+
split="+".join(dataset.split),
178+
)
154179

155180
category_dataset_pred_src = []
156-
is_secondhand_pred = []
157-
is_secondhand_src = []
181+
is_secondhand_pred_src = []
158182
for elem in model_output:
159183
byte_data = bytes.fromhex(elem["data"])
160184
idx = elem["qsl_idx"]
@@ -163,15 +187,13 @@ def run_evaluation(filename: FilePath, dataset: "DatasetCLI") -> None:
163187
ground_truth_item = original_data[idx]
164188
category_dataset_pred_src.append((pred_item["category"],
165189
ground_truth_item["ground_truth_category"]))
166-
is_secondhand_pred.append(int(pred_item["is_secondhand"]))
167-
is_secondhand_src.append(
168-
int(ground_truth_item["ground_truth_is_secondhand"]))
190+
is_secondhand_pred_src.append((int(pred_item["is_secondhand"]),
191+
int(ground_truth_item["ground_truth_is_secondhand"])))
169192

170-
category_f1_score = calculate_hierarchical_metrics(
193+
category_f1_score = calculate_hierarchical_f1(
171194
category_dataset_pred_src)
172-
hiclass_f1 = alt_f1_score(category_dataset_pred_src)
173-
is_secondhand_f1_score = f1_score(is_secondhand_src,
174-
is_secondhand_pred)
195+
hiclass_f1 = calculate_hiclass_f1(category_dataset_pred_src)
196+
is_secondhand_f1_score = calculate_secondhand_f1(is_secondhand_pred_src)
175197

176198
data = [
177199
["category", category_f1_score, hiclass_f1],

multimodal/vl2l/src/mlperf_inference_multimodal_vl2l/task.py

Lines changed: 37 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -46,15 +46,17 @@ def __init__(
4646
dataset_cli: The dataset configuration passed in from the CLI.
4747
model_cli: The model configuration passed in from the CLI.
4848
endpoint_cli: The endpoint configuration passed in from the CLI.
49-
settings: Parameters of the current benchmark
49+
settings: Parameters of the current benchmark.
5050
random_seed: The random seed to use for the task.
5151
"""
5252
random.seed(random_seed)
5353
self.scenario = settings.scenario
5454
self.dataset = load_dataset(
5555
dataset_cli.repo_id,
5656
token=dataset_cli.token,
57+
split="+".join(dataset_cli.split),
5758
)
59+
logger.info(f"LEN: {len(self.dataset)}")
5860
self.model_cli = model_cli
5961
self.openai_api_client = AsyncOpenAI(
6062
base_url=endpoint_cli.url,
@@ -377,7 +379,7 @@ def __init__(
377379
dataset_cli: The dataset configuration passed in from the CLI.
378380
model_cli: The model configuration passed in from the CLI.
379381
endpoint_cli: The endpoint configuration passed in from the CLI.
380-
settings: Parameters of the current benchmark
382+
settings: Parameters of the current benchmark.
381383
random_seed: The random seed to use for the task.
382384
"""
383385
super().__init__(
@@ -387,8 +389,6 @@ def __init__(
387389
settings=settings,
388390
random_seed=random_seed,
389391
)
390-
# Shopify only released the train split so far.
391-
self.dataset = self.dataset[dataset_cli.split]
392392

393393
@staticmethod
394394
def formulate_messages(
@@ -409,44 +409,45 @@ def formulate_messages(
409409
return [
410410
{
411411
"role": "system",
412-
"content": """
413-
Please analyze the product from the user prompt
414-
and provide the following fields in a valid JSON object:
415-
- category
416-
- brand
417-
- is_secondhand
418-
419-
You must choose only one, which is the most appropriate/correct,
420-
category out of the list of possible product categories.
421-
422-
Your response should only contain a valid JSON object and nothing more.
423-
The JSON object should match the followng JSON schema:
424-
```json
425-
{
426-
"type": "object",
427-
"properties": {
428-
"category": {"type": "string"},
429-
"brand": {"type": "string"},
430-
"is_secondhand": {"type": "boolean"}
431-
}
432-
}
433-
```
434-
""",
412+
"content": """Please analyze the product from the user prompt
413+
and provide the following fields in a valid JSON object:
414+
- category
415+
- brand
416+
- is_secondhand
417+
You must choose only one, which is the most appropriate/correct,
418+
category out of the list of possible product categories.
419+
Your response should only contain a valid JSON object and nothing more.
420+
The JSON object should match the following JSON schema:
421+
```json
422+
{
423+
"type": "object",
424+
"properties": {
425+
"category": {"type": "string"},
426+
"brand": {"type": "string"},
427+
"is_secondhand": {"type": "boolean"}
428+
}
429+
}
430+
```
431+
""",
435432
},
436433
{
437434
"role": "user",
438435
"content": [
439436
{
440437
"type": "text",
441-
"text": (
442-
f"The title of the product is: {sample['product_title']}\n"
443-
f"The description of the product is: "
444-
f"{sample['product_description']}\n\n",
445-
"These are the possible product categories: ",
446-
f"{sample['potential_product_categories']}.",
447-
"You must choose only one and return the answer"
448-
" as string and not as a list",
449-
),
438+
"text": f"""The title of the product is the following:
439+
```text
440+
{sample['product_title']}
441+
```
442+
The description of the product is the following:
443+
```text
444+
{sample['product_description']}
445+
```
446+
The following are the possible product categories:
447+
```json
448+
{sample['potential_product_categories']}
449+
```
450+
""",
450451
},
451452
{
452453
"type": "image_url",

0 commit comments

Comments
 (0)