Commit 281b2e9

Introduce the mlperf-inf-mm-q3vl benchmark plugin system
1 parent 27db053 commit 281b2e9

File tree

4 files changed (+255, -22 lines)

multimodal/qwen3-vl/README.md

Lines changed: 190 additions & 0 deletions
@@ -268,6 +268,196 @@ bash submit.sh --help
- Testing duration $\ge$ 10 mins.
- Sample concatenation permutation is enabled.

## Plugin System for `mlperf-inf-mm-q3vl benchmark`

The `mlperf-inf-mm-q3vl` package supports a plugin system that allows third-party
packages to register additional subcommands under `mlperf-inf-mm-q3vl benchmark`. This
uses Python's standard entry points mechanism.

The purpose of this feature is to allow benchmark result submitters to customize and fit
`mlperf-inf-mm-q3vl` to the inference system that they would like to benchmark,
**without** directly modifying the source code of `mlperf-inf-mm-q3vl`, which is frozen
once the benchmark is finalized.

### How it works

1. **Plugin Discovery**: When the CLI starts, it automatically discovers all registered
   plugins via the `mlperf_inf_mm_q3vl.benchmark_plugins` entry point group (sketched
   below).
2. **Plugin Loading**: Each plugin's entry point function is called to retrieve either a
   single command or a Typer app.
3. **Command Registration**: The plugin's commands are automatically added to the
   `benchmark` subcommand group.

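Roughly what the discovery and registration steps do at CLI startup, condensed from the
`_load_benchmark_plugins` helper added to `cli.py` in this commit (`benchmark_app` is the
existing Typer sub-app behind `mlperf-inf-mm-q3vl benchmark`):

```python
from importlib.metadata import entry_points

# Discover every plugin registered under the entry point group.
for entry_point in entry_points(group="mlperf_inf_mm_q3vl.benchmark_plugins"):
    plugin_result = entry_point.load()()  # call the plugin's registration function
    if callable(plugin_result):
        # A bare command function becomes `mlperf-inf-mm-q3vl benchmark <entry-point-name>`.
        benchmark_app.command(name=entry_point.name)(plugin_result)
    else:
        # A (Typer app, name) tuple becomes a nested subcommand group.
        plugin_app, plugin_name = plugin_result
        benchmark_app.add_typer(plugin_app, name=plugin_name)
```
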
### Example: creating a `mlperf-inf-mm-q3vl-foo` plugin package for `mlperf-inf-mm-q3vl benchmark foo`

#### Step 1: Package Structure

Create a new Python package with the following structure:

```
mlperf-inf-mm-q3vl-foo/
├── pyproject.toml
└── src/
    └── mlperf_inf_mm_q3vl_foo/
        ├── __init__.py
        ├── plugin.py
        ├── schema.py
        └── deploy.py
```

`schema.py` and `deploy.py` hold the plugin's own `FooEndpoint` schema and `FooDeployer`
helper that Step 2 imports.

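A minimal sketch of what that `FooEndpoint` model in `schema.py` could look like; every
field below is an illustrative assumption, not part of `mlperf-inf-mm-q3vl`:

```python
"""Hypothetical schema for the Foo inference system (illustrative only)."""

from pydantic import BaseModel


class FooEndpoint(BaseModel):
    """Connection details for a Foo deployment; every field here is an assumption."""

    url: str = "http://localhost:8000/v1"
    api_key: str = "EMPTY"
    num_replicas: int = 1
```
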
#### Step 2: Implement the `mlperf-inf-mm-q3vl-foo` plugin

Create your plugin entry point function in `plugin.py`:

```python
"""Plugin to support benchmarking the Foo inference system."""

from collections.abc import Callable
from typing import Annotated

from loguru import logger
from typer import Option

from mlperf_inf_mm_q3vl.cli import run_benchmark
from mlperf_inf_mm_q3vl.log import setup_loguru_for_benchmark
from mlperf_inf_mm_q3vl.schema import Dataset, Settings, Verbosity

from .schema import FooEndpoint


def register_foo_benchmark() -> Callable[..., None]:
    """Entry point for the plugin to benchmark the Foo inference system.

    This function is called when the CLI discovers the plugin.
    It should return either:
    - A single command function (decorated with appropriate options)
    - A tuple of (Typer app, command name) for more complex hierarchies
    """

    def benchmark_foo(
        *,
        settings: Settings,
        dataset: Dataset,
        # Add your foo-specific parameters here
        foo: FooEndpoint,
        custom_param: Annotated[
            int,
            Option(help="Custom parameter for foo backend"),
        ] = 2,
        random_seed: Annotated[
            int,
            Option(help="The seed for the random number generator."),
        ] = 12345,
        verbosity: Annotated[
            Verbosity,
            Option(help="The verbosity level of the logger."),
        ] = Verbosity.INFO,
    ) -> None:
        """Deploy and benchmark using the Foo backend.

        This command deploys a model using the Foo backend
        and runs the MLPerf benchmark against it.
        """
        from .deploy import FooDeployer

        setup_loguru_for_benchmark(settings=settings, verbosity=verbosity)
        logger.info(
            "Starting to benchmark the Foo inference system with endpoint spec {} and custom param {}",
            foo,
            custom_param,
        )
        # Your implementation here
        with FooDeployer(endpoint=foo, settings=settings, custom_param=custom_param):
            # FooDeployer makes sure that Foo is deployed and currently healthy.
            # Run the benchmark using the core run_benchmark function.
            run_benchmark(
                settings=settings,
                dataset=dataset,
                endpoint=foo,
                random_seed=random_seed,
            )

    # Return the command function.
    # The entry point name will be used as the subcommand name.
    return benchmark_foo
```

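Similarly, `FooDeployer` in `deploy.py` is plugin-specific code you would write yourself.
A minimal sketch of the context-manager shape the example relies on, with the launch and
shutdown details (including the placeholder `foo-server` executable) as assumptions:

```python
"""Hypothetical deployer for the Foo inference system (illustrative only)."""

import subprocess

from mlperf_inf_mm_q3vl.schema import Settings

from .schema import FooEndpoint


class FooDeployer:
    """Start a Foo server on __enter__ and tear it down on __exit__ (sketch)."""

    def __init__(self, *, endpoint: FooEndpoint, settings: Settings, custom_param: int) -> None:
        self.endpoint = endpoint
        self.settings = settings
        self.custom_param = custom_param
        self._process: subprocess.Popen[bytes] | None = None

    def __enter__(self) -> "FooDeployer":
        # Launch the placeholder Foo server; a real deployer would also poll a
        # health endpoint here before handing control back to the benchmark.
        self._process = subprocess.Popen(["foo-server", "--port", "8000"])
        return self

    def __exit__(self, *exc_info: object) -> None:
        # Shut the server down so the benchmark leaves no stray processes behind.
        if self._process is not None:
            self._process.terminate()
            self._process.wait()
```
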
#### Step 3: Configure `pyproject.toml`

Register the plugin in its package's `pyproject.toml`:

```toml
[project]
name = "mlperf-inf-mm-q3vl-foo"
version = "0.1.0"
description = "Enable mlperf-inf-mm-q3vl to benchmark the Foo inference system."
requires-python = ">=3.12"
dependencies = [
    "mlperf-inf-mm-q3vl @ git+https://github.com/mlcommons/inference.git#subdirectory=multimodal/qwen3-vl/",
    # Add your backend-specific dependencies here
]

[project.entry-points."mlperf_inf_mm_q3vl.benchmark_plugins"]
# The key here becomes the subcommand name.
foo = "mlperf_inf_mm_q3vl_foo.plugin:register_foo_benchmark"

[build-system]
requires = ["setuptools>=80"]
build-backend = "setuptools.build_meta"
```

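Once the plugin is installed, you can confirm that the entry point is visible without
invoking the CLI at all; this quick check uses only the standard library:

```python
from importlib.metadata import entry_points

# Lists every benchmark plugin the CLI will discover at startup, e.g.
# "foo -> mlperf_inf_mm_q3vl_foo.plugin:register_foo_benchmark".
for ep in entry_points(group="mlperf_inf_mm_q3vl.benchmark_plugins"):
    print(ep.name, "->", ep.value)
```
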
#### Step 4: Install and use `mlperf-inf-mm-q3vl benchmark foo`

```bash
# Install your plugin package
pip install mlperf-inf-mm-q3vl-foo

# The new subcommand is now available
mlperf-inf-mm-q3vl benchmark foo --help
mlperf-inf-mm-q3vl benchmark foo \
    --settings-file settings.toml \
    --dataset shopify-global-catalogue \
    --custom-param 3
```

#### Advanced: Nested Subcommands

If you want to create multiple subcommands under a single plugin (e.g.,
`mlperf-inf-mm-q3vl benchmark foo standard` and
`mlperf-inf-mm-q3vl benchmark foo optimized`), return a tuple of `(Typer app, name)`:

```python
from pydantic_typer import Typer


def register_foo_benchmark() -> tuple[Typer, str]:
    """Entry point that creates nested subcommands."""
    # Create a Typer app for your plugin
    foo_app = Typer(help="Benchmarking options for the Foo inference systems.")

    @foo_app.command(name="standard")
    def foo_standard() -> None:  # add your parameters here
        """Run the standard Foo benchmark."""
        # Implementation
        ...

    @foo_app.command(name="optimized")
    def foo_optimized() -> None:  # add your parameters here
        """Run the optimized Foo benchmark with max performance."""
        # Implementation
        ...

    # Return a tuple of (app, command_name)
    return (foo_app, "foo")
```

This will create:

- `mlperf-inf-mm-q3vl benchmark foo standard`
- `mlperf-inf-mm-q3vl benchmark foo optimized`

### Best Practices

1. **Dependencies**: Declare `mlperf-inf-mm-q3vl` as a dependency in your plugin package.
2. **Documentation**: Provide clear docstrings for your plugin commands; they appear in
   the `--help` output.
3. **Schema Reuse**: Reuse the core `Settings`, `Dataset`, and other schemas from
   `mlperf_inf_mm_q3vl.schema` for consistency and to minimize boilerplate code.
4. **Lazy Imports**: If your plugin has heavy dependencies, import them inside functions
   rather than at module level (as the Step 2 example does with `FooDeployer`) to avoid
   slowing down CLI startup.

## Developer Guide

multimodal/qwen3-vl/scripts/slurm/submit.sh

Lines changed: 2 additions & 2 deletions
@@ -99,12 +99,12 @@ while [[ $# -gt 0 ]]; do
     shift
     ;;
   -seq | --server-expected-qps)
-    server_expected_qps=$2
+    server_target_qps=$2
     shift
     shift
     ;;
   -seq=* | --server-expected-qps=*)
-    server_expected_qps=${1#*=}
+    server_target_qps=${1#*=}
     shift
     ;;
   -tps | --tensor-parallel-size)

multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/cli.py

Lines changed: 57 additions & 9 deletions
@@ -2,6 +2,8 @@

 from __future__ import annotations

+from collections.abc import Sequence
+from importlib.metadata import entry_points
 from typing import Annotated

 import mlperf_loadgen as lg
@@ -24,6 +26,56 @@
     help="Main CLI for running the Qwen3-VL (Q3VL) benchmark.",
 )

+_PLUGIN_RESULT_APP_AND_NAME = 2
+
+
+def _load_benchmark_plugins() -> None:
+    """Load and register benchmark plugins from third-party packages."""
+    # Discover plugins from the entry point group
+    discovered_plugins = entry_points(group="mlperf_inf_mm_q3vl.benchmark_plugins")
+
+    for entry_point in discovered_plugins:
+        try:
+            # Load the plugin function
+            plugin_func = entry_point.load()
+
+            # Call the plugin function to get the command/typer app
+            plugin_result = plugin_func()
+
+            # Register it with the benchmark app
+            if (
+                isinstance(plugin_result, Sequence)
+                and len(plugin_result) == _PLUGIN_RESULT_APP_AND_NAME
+            ):
+                # Plugin returns (typer_app, name)
+                plugin_app, plugin_name = plugin_result
+                benchmark_app.add_typer(plugin_app, name=plugin_name)
+                logger.debug(
+                    "Loaded benchmark plugin: {} from {}",
+                    plugin_name,
+                    entry_point.name,
+                )
+            elif callable(plugin_result):
+                # Plugin returns just a command function
+                benchmark_app.command(name=entry_point.name)(plugin_result)
+                logger.debug("Loaded benchmark command: {}", entry_point.name)
+            else:
+                logger.warning(
+                    "Unsupported plugin function return type {} for plugin {}",
+                    type(plugin_result),
+                    entry_point.name,
+                )
+        except Exception as e:  # noqa: BLE001
+            logger.warning(
+                "Failed to load benchmark plugin {} with error: {}",
+                entry_point.name,
+                e,
+            )
+
+
+# Load plugins when the module is imported
+_load_benchmark_plugins()
+

 @app.command()
 def evaluate(
@@ -66,32 +118,28 @@ def benchmark_endpoint(
     accessible via a URL (and an API key, if applicable).
     """
     setup_loguru_for_benchmark(settings=settings, verbosity=verbosity)
-    _run_benchmark(
+    run_benchmark(
         settings=settings,
         dataset=dataset,
         endpoint=endpoint,
         random_seed=random_seed,
     )


-def _run_benchmark(
+def run_benchmark(
     settings: Settings,
     dataset: Dataset,
     endpoint: Endpoint,
     random_seed: int,
 ) -> None:
     """Run the Qwen3-VL (Q3VL) benchmark."""
-    logger.info(
-        "Running Qwen3-VL (Q3VL) benchmark with settings: {}",
-        settings)
+    logger.info("Running Qwen3-VL (Q3VL) benchmark with settings: {}", settings)
     logger.info("Running Qwen3-VL (Q3VL) benchmark with dataset: {}", dataset)
     logger.info(
         "Running Qwen3-VL (Q3VL) benchmark with OpenAI API endpoint: {}",
         endpoint,
     )
-    logger.info(
-        "Running Qwen3-VL (Q3VL) benchmark with random seed: {}",
-        random_seed)
+    logger.info("Running Qwen3-VL (Q3VL) benchmark with random seed: {}", random_seed)
     test_settings, log_settings = settings.to_lgtype()
     task = ShopifyGlobalCatalogue(
         dataset=dataset,
@@ -130,7 +178,7 @@ def benchmark_vllm(
     """
     setup_loguru_for_benchmark(settings=settings, verbosity=verbosity)
     with LocalVllmDeployer(endpoint=vllm, settings=settings):
-        _run_benchmark(
+        run_benchmark(
             settings=settings,
             dataset=dataset,
             endpoint=vllm,

multimodal/qwen3-vl/src/mlperf_inf_mm_q3vl/task.py

Lines changed: 6 additions & 11 deletions
@@ -67,8 +67,7 @@ def __init__(
         self.openai_api_client = AsyncOpenAI(
             base_url=endpoint.url,
             http_client=DefaultAioHttpClient(
-                timeout=httpx.Timeout(
-                    timeout=request_timeout_seconds, connect=5.0),
+                timeout=httpx.Timeout(timeout=request_timeout_seconds, connect=5.0),
             ),
             api_key=endpoint.api_key,
             timeout=request_timeout_seconds,
@@ -188,9 +187,7 @@ def estimated_num_performance_samples(self) -> int:
         """
         estimation_indices = random.sample(
             range(self.total_num_samples),
-            k=min(
-                MAX_NUM_ESTIMATION_PERFORMANCE_SAMPLES,
-                self.total_num_samples),
+            k=min(MAX_NUM_ESTIMATION_PERFORMANCE_SAMPLES, self.total_num_samples),
         )
         estimation_samples = [
             self.formulate_loaded_sample(
@@ -277,8 +274,7 @@ def _unload_samples_from_ram(query_sample_indices: list[int]) -> None:
             _unload_samples_from_ram,
         )

-    async def _query_endpoint_async_batch(
-            self, query_sample: lg.QuerySample) -> None:
+    async def _query_endpoint_async_batch(self, query_sample: lg.QuerySample) -> None:
         """Query the endpoint through the async OpenAI API client."""
         try:
             sample = self.loaded_samples[query_sample.index]
@@ -295,7 +291,7 @@ async def _query_endpoint_async_batch(
                 sample,
             )
             tic = time.perf_counter()
-            response = await self.openai_api_client.chat.completions.create(  # type: ignore[call-overload]
+            response = await self.openai_api_client.chat.completions.create(  # type: ignore[call-overload, misc]
                 model=self.endpoint.model.repo_id,
                 messages=sample.messages,
                 response_format=(
@@ -364,8 +360,7 @@ async def _query_endpoint_async_batch(
                 ],
             )

-    async def _query_endpoint_async_stream(
-            self, query_sample: lg.QuerySample) -> None:
+    async def _query_endpoint_async_stream(self, query_sample: lg.QuerySample) -> None:
         """Query the endpoint through the async OpenAI API client."""
         ttft_set = False
         try:
@@ -383,7 +378,7 @@ async def _query_endpoint_async_stream(
                 sample,
             )
             word_array = []
-            stream = await self.openai_api_client.chat.completions.create(  # type: ignore[call-overload]
+            stream = await self.openai_api_client.chat.completions.create(  # type: ignore[call-overload, misc]
                 stream=True,
                 model=self.endpoint.model.repo_id,
                 messages=sample.messages,
