diff --git a/pylock.toml b/pylock.toml index c4a1545d..bbbf866a 100644 --- a/pylock.toml +++ b/pylock.toml @@ -167,64 +167,13 @@ dependencies = [ "importlib-metadata; python_version < \"3.8\"", ] -[[packages]] -name = "setuptools-git-versioning" -version = "2.1.0" -requires-python = ">=3.7" -sdist = {name = "setuptools_git_versioning-2.1.0.tar.gz", url = "https://files.pythonhosted.org/packages/f0/72/507b0b459b1fdbf5705aecbc5330c32d62dd41560718d2720bb6d94607f5/setuptools_git_versioning-2.1.0.tar.gz", hashes = {sha256 = "6aef5b8bb1cfb953b6b343d27cbfc561d96cf2a2ee23c2e0dd3591042a059921"}} -wheels = [ - {name = "setuptools_git_versioning-2.1.0-py3-none-any.whl",url = "https://files.pythonhosted.org/packages/c0/ba/daf16c2d1965bf6237fb696639e3e93645ac6801f7dcaf9ec694a74e9326/setuptools_git_versioning-2.1.0-py3-none-any.whl",hashes = {sha256 = "09a15cbb9a00884e91a3591a4c9ec1ff93c24b1b4a40de39a44815196beb7ebf"}}, -] -marker = "\"dev\" in extras" - -[packages.tool.pdm] -dependencies = [ - "packaging", - "setuptools", - "tomli>=2.0.1; python_version < \"3.11\"", -] - -[[packages]] -name = "build" -version = "1.3.0" -requires-python = ">=3.9" -sdist = {name = "build-1.3.0.tar.gz", url = "https://files.pythonhosted.org/packages/25/1c/23e33405a7c9eac261dff640926b8b5adaed6a6eb3e1767d441ed611d0c0/build-1.3.0.tar.gz", hashes = {sha256 = "698edd0ea270bde950f53aed21f3a0135672206f3911e0176261a31e0e07b397"}} -wheels = [ - {name = "build-1.3.0-py3-none-any.whl",url = "https://files.pythonhosted.org/packages/cb/8c/2b30c12155ad8de0cf641d76a8b396a16d2c36bc6d50b621a62b7c4567c1/build-1.3.0-py3-none-any.whl",hashes = {sha256 = "7145f0b5061ba90a1500d60bd1b13ca0a8a4cebdd0cc16ed8adf1c0e739f43b4"}}, -] -marker = "\"dev\" in extras" - -[packages.tool.pdm] -dependencies = [ - "packaging>=19.1", - "pyproject-hooks", - "colorama; os_name == \"nt\"", - "importlib-metadata>=4.6; python_full_version < \"3.10.2\"", - "tomli>=1.1.0; python_version < \"3.11\"", -] - -[[packages]] -name = "culsans" -version = "0.9.0" -requires-python = ">=3.8" -sdist = {name = "culsans-0.9.0.tar.gz", url = "https://files.pythonhosted.org/packages/90/5d/12e7e16b0caafaa8cca0728dd817204afd1274ddb35531b029b1c5cf7b2a/culsans-0.9.0.tar.gz", hashes = {sha256 = "942dd3c3c77f20e9ac3383d9a5ef8b7b24c0dac1a593bdb20d46c8a38720a5f3"}} -wheels = [ - {name = "culsans-0.9.0-py3-none-any.whl",url = "https://files.pythonhosted.org/packages/6f/b4/1e3cccb48f09e89e0cfc06925182cbcd36abf80b8eda2489430b41c7eaff/culsans-0.9.0-py3-none-any.whl",hashes = {sha256 = "d3537b65bbb341c2ac72e7d152deb8ab893b2a00452d2a68702a1a1a41619d6f"}}, -] -marker = "\"default\" in dependency_groups" - -[packages.tool.pdm] -dependencies = [ - "aiologic>=0.13.0", -] - [[packages]] name = "datasets" -version = "4.2.0" +version = "4.4.0" requires-python = ">=3.9.0" -sdist = {name = "datasets-4.2.0.tar.gz", url = "https://files.pythonhosted.org/packages/70/48/0186fbc4b86a4f9ecaf04eb01e877e78b53bfa0b03be9c84b2298431ba33/datasets-4.2.0.tar.gz", hashes = {sha256 = "8333a7db9f3bb8044c1b819a35d4e3e2809596c837793b0921382efffdc36e78"}} +sdist = {name = "datasets-4.4.0.tar.gz", url = "https://files.pythonhosted.org/packages/57/13/f05a80bbbac5f62e492e5e463ec59a4479647ef9c376b1fdfaa4d3ed01cc/datasets-4.4.0.tar.gz", hashes = {sha256 = "0430d39b9f13b53c37afb80c23c7e5d8c6ceccc014c14a14d15fa2b4e8688d2a"}} wheels = [ - {name = "datasets-4.2.0-py3-none-any.whl",url = "https://files.pythonhosted.org/packages/91/9e/0bbbd09b116fd8ee2d3617e28e6598551d2f0f24d3a2ce99cc87ec85aeb0/datasets-4.2.0-py3-none-any.whl",hashes = 
{sha256 = "fdc43aaf4a73b31f64f80f72f195ab413a1141ed15555d675b2fd17926f8b026"}}, + {name = "datasets-4.4.0-py3-none-any.whl",url = "https://files.pythonhosted.org/packages/1e/31/d552336985f747b19f0a852d98ca7a2ef4727ba956b38041cfbda08dde0a/datasets-4.4.0-py3-none-any.whl",hashes = {sha256 = "b7e6d1d48c2e1d3a95d6b378e8fc3d7ab29f24f14ddf505a8d417dd09c692f19"}}, ] marker = "\"default\" in dependency_groups or \"all\" in extras or \"audio\" in extras or \"dev\" in extras or \"vision\" in extras" @@ -239,8 +188,8 @@ dependencies = [ "httpx<1.0.0", "tqdm>=4.66.3", "xxhash", - "multiprocess<0.70.17", - "fsspec[http]<=2025.9.0,>=2023.1.0", + "multiprocess<0.70.19", + "fsspec[http]<=2025.10.0,>=2023.1.0", "huggingface-hub<2.0,>=0.25.0", "packaging", "pyyaml>=5.1", @@ -381,6 +330,57 @@ marker = "\"default\" in dependency_groups or \"all\" in extras or \"audio\" in [packages.tool.pdm] dependencies = [] +[[packages]] +name = "setuptools-git-versioning" +version = "2.1.0" +requires-python = ">=3.7" +sdist = {name = "setuptools_git_versioning-2.1.0.tar.gz", url = "https://files.pythonhosted.org/packages/f0/72/507b0b459b1fdbf5705aecbc5330c32d62dd41560718d2720bb6d94607f5/setuptools_git_versioning-2.1.0.tar.gz", hashes = {sha256 = "6aef5b8bb1cfb953b6b343d27cbfc561d96cf2a2ee23c2e0dd3591042a059921"}} +wheels = [ + {name = "setuptools_git_versioning-2.1.0-py3-none-any.whl",url = "https://files.pythonhosted.org/packages/c0/ba/daf16c2d1965bf6237fb696639e3e93645ac6801f7dcaf9ec694a74e9326/setuptools_git_versioning-2.1.0-py3-none-any.whl",hashes = {sha256 = "09a15cbb9a00884e91a3591a4c9ec1ff93c24b1b4a40de39a44815196beb7ebf"}}, +] +marker = "\"dev\" in extras" + +[packages.tool.pdm] +dependencies = [ + "packaging", + "setuptools", + "tomli>=2.0.1; python_version < \"3.11\"", +] + +[[packages]] +name = "build" +version = "1.3.0" +requires-python = ">=3.9" +sdist = {name = "build-1.3.0.tar.gz", url = "https://files.pythonhosted.org/packages/25/1c/23e33405a7c9eac261dff640926b8b5adaed6a6eb3e1767d441ed611d0c0/build-1.3.0.tar.gz", hashes = {sha256 = "698edd0ea270bde950f53aed21f3a0135672206f3911e0176261a31e0e07b397"}} +wheels = [ + {name = "build-1.3.0-py3-none-any.whl",url = "https://files.pythonhosted.org/packages/cb/8c/2b30c12155ad8de0cf641d76a8b396a16d2c36bc6d50b621a62b7c4567c1/build-1.3.0-py3-none-any.whl",hashes = {sha256 = "7145f0b5061ba90a1500d60bd1b13ca0a8a4cebdd0cc16ed8adf1c0e739f43b4"}}, +] +marker = "\"dev\" in extras" + +[packages.tool.pdm] +dependencies = [ + "packaging>=19.1", + "pyproject-hooks", + "colorama; os_name == \"nt\"", + "importlib-metadata>=4.6; python_full_version < \"3.10.2\"", + "tomli>=1.1.0; python_version < \"3.11\"", +] + +[[packages]] +name = "culsans" +version = "0.9.0" +requires-python = ">=3.8" +sdist = {name = "culsans-0.9.0.tar.gz", url = "https://files.pythonhosted.org/packages/90/5d/12e7e16b0caafaa8cca0728dd817204afd1274ddb35531b029b1c5cf7b2a/culsans-0.9.0.tar.gz", hashes = {sha256 = "942dd3c3c77f20e9ac3383d9a5ef8b7b24c0dac1a593bdb20d46c8a38720a5f3"}} +wheels = [ + {name = "culsans-0.9.0-py3-none-any.whl",url = "https://files.pythonhosted.org/packages/6f/b4/1e3cccb48f09e89e0cfc06925182cbcd36abf80b8eda2489430b41c7eaff/culsans-0.9.0-py3-none-any.whl",hashes = {sha256 = "d3537b65bbb341c2ac72e7d152deb8ab893b2a00452d2a68702a1a1a41619d6f"}}, +] +marker = "\"default\" in dependency_groups" + +[packages.tool.pdm] +dependencies = [ + "aiologic>=0.13.0", +] + [[packages]] name = "ftfy" version = "6.3.1" @@ -1247,6 +1247,22 @@ marker = "\"all\" in extras or \"dev\" in extras or \"perf\" in 
extras or \"reco [packages.tool.pdm] dependencies = [] +[[packages]] +name = "pandas-stubs" +version = "2.3.2.250926" +requires-python = ">=3.10" +sdist = {name = "pandas_stubs-2.3.2.250926.tar.gz", url = "https://files.pythonhosted.org/packages/1b/3b/32be58a125db39d0b5f62cc93795f32b5bb2915bd5c4a46f0e35171985e2/pandas_stubs-2.3.2.250926.tar.gz", hashes = {sha256 = "c64b9932760ceefb96a3222b953e6a251321a9832a28548be6506df473a66406"}} +wheels = [ + {name = "pandas_stubs-2.3.2.250926-py3-none-any.whl",url = "https://files.pythonhosted.org/packages/40/96/1e4a035eaf4dce9610aac6e43026d0c6baa05773daf6d21e635a4fe19e21/pandas_stubs-2.3.2.250926-py3-none-any.whl",hashes = {sha256 = "81121818453dcfe00f45c852f4dceee043640b813830f6e7bd084a4ef7ff7270"}}, +] +marker = "\"dev\" in extras" + +[packages.tool.pdm] +dependencies = [ + "numpy>=1.23.5", + "types-pytz>=2022.1.1", +] + [[packages]] name = "protobuf" version = "6.32.1" @@ -1306,6 +1322,19 @@ dependencies = [ "setuptools>=70.1.0", ] +[[packages]] +name = "tabulate" +version = "0.9.0" +requires-python = ">=3.7" +sdist = {name = "tabulate-0.9.0.tar.gz", url = "https://files.pythonhosted.org/packages/ec/fe/802052aecb21e3797b8f7902564ab6ea0d60ff8ca23952079064155d1ae1/tabulate-0.9.0.tar.gz", hashes = {sha256 = "0095b12bf5966de529c0feb1fa08671671b3368eec77d7ef7ab114be2c068b3c"}} +wheels = [ + {name = "tabulate-0.9.0-py3-none-any.whl",url = "https://files.pythonhosted.org/packages/40/44/4a5f08c96eb108af5cb50b41f76142f0afa346dfa99d5296fe7202a11854/tabulate-0.9.0-py3-none-any.whl",hashes = {sha256 = "024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f"}}, +] +marker = "\"default\" in dependency_groups" + +[packages.tool.pdm] +dependencies = [] + [[packages]] name = "transformers" version = "4.57.1" @@ -1781,11 +1810,11 @@ dependencies = [] [[packages]] name = "fsspec" -version = "2025.9.0" +version = "2025.10.0" requires-python = ">=3.9" -sdist = {name = "fsspec-2025.9.0.tar.gz", url = "https://files.pythonhosted.org/packages/de/e0/bab50af11c2d75c9c4a2a26a5254573c0bd97cea152254401510950486fa/fsspec-2025.9.0.tar.gz", hashes = {sha256 = "19fd429483d25d28b65ec68f9f4adc16c17ea2c7c7bf54ec61360d478fb19c19"}} +sdist = {name = "fsspec-2025.10.0.tar.gz", url = "https://files.pythonhosted.org/packages/24/7f/2747c0d332b9acfa75dc84447a066fdf812b5a6b8d30472b74d309bfe8cb/fsspec-2025.10.0.tar.gz", hashes = {sha256 = "b6789427626f068f9a83ca4e8a3cc050850b6c0f71f99ddb4f542b8266a26a59"}} wheels = [ - {name = "fsspec-2025.9.0-py3-none-any.whl",url = "https://files.pythonhosted.org/packages/47/71/70db47e4f6ce3e5c37a607355f80da8860a33226be640226ac52cb05ef2e/fsspec-2025.9.0-py3-none-any.whl",hashes = {sha256 = "530dc2a2af60a414a832059574df4a6e10cce927f6f4a78209390fe38955cfb7"}}, + {name = "fsspec-2025.10.0-py3-none-any.whl",url = "https://files.pythonhosted.org/packages/eb/02/a6b21098b1d5d6249b7c5ab69dde30108a71e4e819d4a9778f1de1d5b70d/fsspec-2025.10.0-py3-none-any.whl",hashes = {sha256 = "7c7712353ae7d875407f97715f0e1ffcc21e33d5b24556cb1e090ae9409ec61d"}}, ] marker = "\"default\" in dependency_groups or \"all\" in extras or \"audio\" in extras or \"dev\" in extras or \"vision\" in extras" @@ -3596,6 +3625,19 @@ dependencies = [ "html5tagger>=1.2.1", ] +[[packages]] +name = "types-pytz" +version = "2025.2.0.20250809" +requires-python = ">=3.9" +sdist = {name = "types_pytz-2025.2.0.20250809.tar.gz", url = "https://files.pythonhosted.org/packages/07/e2/c774f754de26848f53f05defff5bb21dd9375a059d1ba5b5ea943cf8206e/types_pytz-2025.2.0.20250809.tar.gz", hashes = {sha256 = 
"222e32e6a29bb28871f8834e8785e3801f2dc4441c715cd2082b271eecbe21e5"}} +wheels = [ + {name = "types_pytz-2025.2.0.20250809-py3-none-any.whl",url = "https://files.pythonhosted.org/packages/db/d0/91c24fe54e565f2344d7a6821e6c6bb099841ef09007ea6321a0bac0f808/types_pytz-2025.2.0.20250809-py3-none-any.whl",hashes = {sha256 = "4f55ed1b43e925cf851a756fe1707e0f5deeb1976e15bf844bcaa025e8fbd0db"}}, +] +marker = "\"dev\" in extras" + +[packages.tool.pdm] +dependencies = [] + [[packages]] name = "ujson" version = "5.11.0" @@ -4464,7 +4506,7 @@ marker = "python_full_version >= \"3.10.0\" and python_full_version < \"3.10.2\" dependencies = [] [tool.pdm] -hashes = {sha256 = "a61aad0c4563f9e4a33622000214136c2a7aa01d28a2e89e220a415039e7e3eb"} +hashes = {sha256 = "78b9a92a016e9cc24989f5691183181d059a55ee416647f9e8d00bd35cd38c35"} strategy = ["inherit_metadata", "static_urls"] [[tool.pdm.targets]] diff --git a/pyproject.toml b/pyproject.toml index 1ba5a92f..6583b1b4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -69,6 +69,7 @@ dependencies = [ "pyyaml>=6.0.0", "rich", "sanic", + "tabulate", "transformers", "uvloop>=0.18", "torch", @@ -128,6 +129,7 @@ dev = [ "mdformat-gfm~=0.3.6", # type-checking + "pandas-stubs", "types-PyYAML~=6.0.1", "types-requests~=2.32.0", "types-toml", diff --git a/src/guidellm/__main__.py b/src/guidellm/__main__.py index 1faaaafa..3a2bd6c4 100644 --- a/src/guidellm/__main__.py +++ b/src/guidellm/__main__.py @@ -45,26 +45,12 @@ reimport_benchmarks_report, ) from guidellm.mock_server import MockServer, MockServerConfig -from guidellm.preprocess.dataset import ShortPromptStrategy, process_dataset from guidellm.scheduler import StrategyType from guidellm.schemas import GenerativeRequestType from guidellm.settings import print_config from guidellm.utils import Console, DefaultGroupHandler, get_literal_vals from guidellm.utils import cli as cli_tools -__all__ = [ - "STRATEGY_PROFILE_CHOICES", - "benchmark", - "cli", - "config", - "dataset", - "decode_escaped_str", - "from_file", - "mock_server", - "preprocess", - "run", -] - STRATEGY_PROFILE_CHOICES: list[str] = list(get_literal_vals(ProfileType | StrategyType)) """Available strategy and profile type choices for benchmark execution.""" @@ -256,7 +242,7 @@ def benchmark(): help="Number of worker processes for data loading.", ) @click.option( - "--dataloader_kwargs", + "--dataloader-kwargs", default=BenchmarkGenerativeTextArgs.get_default("dataloader_kwargs"), callback=cli_tools.parse_json, help="JSON string of arguments to pass to the dataloader constructor.", @@ -469,128 +455,6 @@ def preprocess(): """Dataset preprocessing utilities.""" -@preprocess.command( - "dataset", - help=( - "Process a dataset to have specific prompt and output token sizes. " - "Supports multiple strategies for handling prompts and optional " - "Hugging Face Hub upload.\n\n" - "DATA: Path to the input dataset or dataset ID.\n\n" - "OUTPUT_PATH: Path to save the processed dataset, including file suffix." 
- ), - context_settings={"auto_envvar_prefix": "GUIDELLM"}, -) -@click.argument( - "data", - type=str, - required=True, -) -@click.argument( - "output_path", - type=click.Path(file_okay=True, dir_okay=False, writable=True, resolve_path=True), - required=True, -) -@click.option( - "--processor", - type=str, - required=True, - help="Processor or tokenizer name for calculating token counts.", -) -@click.option( - "--processor-args", - default=None, - callback=cli_tools.parse_json, - help="JSON string of arguments to pass to the processor constructor.", -) -@click.option( - "--data-args", - callback=cli_tools.parse_json, - help="JSON string of arguments to pass to dataset creation.", -) -@click.option( - "--short-prompt-strategy", - type=click.Choice([s.value for s in ShortPromptStrategy]), - default=ShortPromptStrategy.IGNORE.value, - show_default=True, - help="Strategy for handling prompts shorter than target length.", -) -@click.option( - "--pad-char", - type=str, - default="", - callback=decode_escaped_str, - help="Character to pad short prompts with when using 'pad' strategy.", -) -@click.option( - "--concat-delimiter", - type=str, - default="", - help=( - "Delimiter for concatenating short prompts (used with 'concatenate' strategy)." - ), -) -@click.option( - "--prompt-tokens", - type=str, - default=None, - help="Prompt tokens configuration (JSON, YAML file, or key=value string).", -) -@click.option( - "--output-tokens", - type=str, - default=None, - help="Output tokens configuration (JSON, YAML file, or key=value string).", -) -@click.option( - "--push-to-hub", - is_flag=True, - help="Push the processed dataset to Hugging Face Hub.", -) -@click.option( - "--hub-dataset-id", - type=str, - default=None, - help=("Hugging Face Hub dataset ID for upload (required if --push-to-hub is set)."), -) -@click.option( - "--random-seed", - type=int, - default=42, - show_default=True, - help="Random seed for reproducible token sampling.", -) -def dataset( - data, - output_path, - processor, - processor_args, - data_args, - short_prompt_strategy, - pad_char, - concat_delimiter, - prompt_tokens, - output_tokens, - push_to_hub, - hub_dataset_id, - random_seed, -): - process_dataset( - data=data, - output_path=output_path, - processor=processor, - prompt_tokens=prompt_tokens, - output_tokens=output_tokens, - processor_args=processor_args, - data_args=data_args, - short_prompt_strategy=short_prompt_strategy, - pad_char=pad_char, - concat_delimiter=concat_delimiter, - push_to_hub=push_to_hub, - hub_dataset_id=hub_dataset_id, - random_seed=random_seed, - ) - - @cli.command( "mock-server", help=( diff --git a/src/guidellm/backends/backend.py b/src/guidellm/backends/backend.py index 89169a48..bc3fe37a 100644 --- a/src/guidellm/backends/backend.py +++ b/src/guidellm/backends/backend.py @@ -102,9 +102,8 @@ def requests_limit(self) -> int | None: return None @abstractmethod - async def default_model(self) -> str | None: + async def default_model(self) -> str: """ :return: The default model name or identifier for generation requests, - None if no default model is available """ ... 
diff --git a/src/guidellm/backends/openai.py b/src/guidellm/backends/openai.py index 1e74fc6e..22b411ae 100644 --- a/src/guidellm/backends/openai.py +++ b/src/guidellm/backends/openai.py @@ -54,7 +54,7 @@ class OpenAIHTTPBackend(Backend): def __init__( self, target: str, - model: str | None = None, + model: str = "", api_routes: dict[str, str] | None = None, response_handlers: dict[str, Any] | None = None, timeout: float = 60.0, @@ -192,7 +192,7 @@ async def available_models(self) -> list[str]: return [item["id"] for item in response.json()["data"]] - async def default_model(self) -> str | None: + async def default_model(self) -> str: """ Get the default model for this backend. @@ -202,9 +202,9 @@ async def default_model(self) -> str | None: return self.model models = await self.available_models() - return models[0] if models else None + return models[0] if models else "" - async def resolve( + async def resolve( # type: ignore[override] self, request: GenerationRequest, request_info: RequestInfo, @@ -230,11 +230,9 @@ async def resolve( if history is not None: raise NotImplementedError("Multi-turn requests not yet supported") - response_handler = self._resolve_response_handler( - request_type=request.request_type - ) if (request_path := self.api_routes.get(request.request_type)) is None: raise ValueError(f"Unsupported request type '{request.request_type}'") + request_url = f"{self.target}/{request_path}" request_files = ( { @@ -246,6 +244,9 @@ async def resolve( ) request_json = request.arguments.body if not request_files else None request_data = request.arguments.body if request_files else None + response_handler = self._resolve_response_handler( + request_type=request.request_type + ) if not request.arguments.stream: request_info.timings.request_start = time.time() @@ -282,24 +283,22 @@ async def resolve( async for chunk in stream.aiter_lines(): iter_time = time.time() - if ( - (iterations := response_handler.add_streaming_line(chunk)) - is None - or iterations < 0 - or end_reached - ): + if request_info.timings.first_request_iteration is None: + request_info.timings.first_request_iteration = iter_time + request_info.timings.last_request_iteration = iter_time + request_info.timings.request_iterations += 1 + + iterations = response_handler.add_streaming_line(chunk) + if iterations is None or iterations <= 0 or end_reached: end_reached = end_reached or iterations is None continue - if ( - request_info.timings.first_iteration is None - or request_info.timings.iterations is None - ): - request_info.timings.first_iteration = iter_time - request_info.timings.iterations = 0 + if request_info.timings.first_token_iteration is None: + request_info.timings.first_token_iteration = iter_time + request_info.timings.token_iterations = 0 - request_info.timings.last_iteration = iter_time - request_info.timings.iterations += iterations + request_info.timings.last_token_iteration = iter_time + request_info.timings.token_iterations += iterations request_info.timings.request_end = time.time() yield response_handler.compile_streaming(request), request_info diff --git a/src/guidellm/backends/response_handlers.py b/src/guidellm/backends/response_handlers.py index b7bd06ad..18aaf320 100644 --- a/src/guidellm/backends/response_handlers.py +++ b/src/guidellm/backends/response_handlers.py @@ -9,7 +9,7 @@ from __future__ import annotations -from typing import Any, Protocol +from typing import Any, Protocol, cast from guidellm.schemas import GenerationRequest, GenerationResponse, UsageMetrics from guidellm.utils 
import RegistryMixin, json @@ -109,14 +109,15 @@ def compile_non_streaming( :return: Standardized GenerationResponse with extracted text and metrics """ choices, usage = self.extract_choices_and_usage(response) - input_metrics, output_metrics = self.extract_metrics(usage) + text = choices[0].get("text", "") if choices else "" + input_metrics, output_metrics = self.extract_metrics(usage, text) return GenerationResponse( request_id=request.request_id, request_args=str( request.arguments.model_dump() if request.arguments else None ), - text=choices[0].get("text", "") if choices else "", + text=text, input_metrics=input_metrics, output_metrics=output_metrics, ) @@ -137,7 +138,7 @@ def add_streaming_line(self, line: str) -> int | None: updated = False choices, usage = self.extract_choices_and_usage(data) - if text := choices[0].get("text"): + if choices and (text := choices[0].get("text")): self.streaming_texts.append(text) updated = True @@ -153,14 +154,15 @@ def compile_streaming(self, request: GenerationRequest) -> GenerationResponse: :param request: Original generation request :return: Standardized GenerationResponse with concatenated text and metrics """ - input_metrics, output_metrics = self.extract_metrics(self.streaming_usage) + text = "".join(self.streaming_texts) + input_metrics, output_metrics = self.extract_metrics(self.streaming_usage, text) return GenerationResponse( request_id=request.request_id, request_args=str( request.arguments.model_dump() if request.arguments else None ), - text="".join(self.streaming_texts), + text=text, input_metrics=input_metrics, output_metrics=output_metrics, ) @@ -194,25 +196,34 @@ def extract_choices_and_usage( return response.get("choices", []), response.get("usage", {}) def extract_metrics( - self, usage: dict[str, int | dict[str, int]] | None + self, usage: dict[str, int | dict[str, int]] | None, text: str ) -> tuple[UsageMetrics, UsageMetrics]: """ Extract input and output usage metrics from API response usage data. 
:param usage: Usage data dictionary from API response + :param text: Generated text for calculating word and character counts :return: Tuple of input_metrics and output_metrics as UsageMetrics objects """ if not usage: - return UsageMetrics(), UsageMetrics() + return UsageMetrics(), UsageMetrics( + text_words=len(text.split()) if text else 0, + text_characters=len(text) if text else 0, + ) - input_details: dict[str, int] = usage.get("prompt_tokens_details", {}) or {} - output_details: dict[str, int] = ( - usage.get("completion_tokens_details", {}) or {} + input_details: dict[str, int] = cast( + "dict[str, int]", usage.get("prompt_tokens_details", {}) or {} + ) + output_details: dict[str, int] = cast( + "dict[str, int]", usage.get("completion_tokens_details", {}) or {} ) + usage_metrics: dict[str, int] = cast("dict[str, int]", usage) return UsageMetrics( text_tokens=( - input_details.get("prompt_tokens") or usage.get("prompt_tokens") + input_details.get("prompt_tokens") + or usage_metrics.get("prompt_tokens") + or 0 ), image_tokens=input_details.get("image_tokens"), video_tokens=input_details.get("video_tokens"), @@ -221,8 +232,11 @@ def extract_metrics( ), UsageMetrics( text_tokens=( output_details.get("completion_tokens") - or usage.get("completion_tokens") + or usage_metrics.get("completion_tokens") + or 0 ), + text_words=len(text.split()) if text else 0, + text_characters=len(text) if text else 0, image_tokens=output_details.get("image_tokens"), video_tokens=output_details.get("video_tokens"), audio_tokens=output_details.get("audio_tokens"), @@ -254,14 +268,16 @@ def compile_non_streaming( :return: Standardized GenerationResponse with extracted content and metrics """ choices, usage = self.extract_choices_and_usage(response) - input_metrics, output_metrics = self.extract_metrics(usage) + choice = choices[0] if choices else {} + text = choice.get("content", "") + input_metrics, output_metrics = self.extract_metrics(usage, text) return GenerationResponse( request_id=request.request_id, request_args=str( request.arguments.model_dump() if request.arguments else None ), - text=(choices[0].get("message", {}).get("content", "") if choices else ""), + text=text, input_metrics=input_metrics, output_metrics=output_metrics, ) @@ -298,14 +314,15 @@ def compile_streaming(self, request: GenerationRequest) -> GenerationResponse: :param request: Original generation request :return: Standardized GenerationResponse with concatenated content and metrics """ - input_metrics, output_metrics = self.extract_metrics(self.streaming_usage) + text = "".join(self.streaming_texts) + input_metrics, output_metrics = self.extract_metrics(self.streaming_usage, text) return GenerationResponse( request_id=request.request_id, request_args=str( request.arguments.model_dump() if request.arguments else None ), - text="".join(self.streaming_texts), + text=text, input_metrics=input_metrics, output_metrics=output_metrics, ) @@ -352,10 +369,9 @@ def compile_non_streaming( :param response: Complete API response containing text and usage data :return: Standardized GenerationResponse with extracted text and metrics """ - usage: dict[str, int | dict[str, int]] = response.get("usage", {}) - input_details: dict[str, int] = usage.get("input_token_details", {}) or {} - output_details: dict[str, int] = usage.get("output_token_details", {}) or {} text: str = response.get("text", "") + usage: dict[str, int | dict[str, int]] = response.get("usage", {}) + input_metrics, output_metrics = self.extract_metrics(usage, text) return 
GenerationResponse( request_id=request.request_id, @@ -363,18 +379,8 @@ def compile_non_streaming( request.arguments.model_dump() if request.arguments else None ), text=text, - input_metrics=UsageMetrics( - text_tokens=input_details.get("text_tokens", usage.get("input_tokens")), - audio_tokens=input_details.get( - "audio_tokens", usage.get("input_tokens") - ), - audio_seconds=input_details.get("seconds", usage.get("seconds")), - ), - output_metrics=UsageMetrics( - text_tokens=output_details.get( - "text_tokens", usage.get("output_tokens") - ), - ), + input_metrics=input_metrics, + output_metrics=output_metrics, ) def add_streaming_line(self, line: str) -> int | None: @@ -394,8 +400,6 @@ def add_streaming_line(self, line: str) -> int | None: return 0 data: dict[str, Any] = json.loads(line) - text: str - usage: dict[str, int | dict[str, int]] updated = False if text := data.get("text"): @@ -414,20 +418,21 @@ def compile_streaming(self, request: GenerationRequest) -> GenerationResponse: :param request: Original generation request :return: Standardized GenerationResponse with concatenated text and metrics """ - input_metrics, output_metrics = self.extract_metrics(self.streaming_usage) + text = "".join(self.streaming_texts) + input_metrics, output_metrics = self.extract_metrics(self.streaming_usage, text) return GenerationResponse( request_id=request.request_id, request_args=str( request.arguments.model_dump() if request.arguments else None ), - text="".join(self.streaming_texts), + text=text, input_metrics=input_metrics, output_metrics=output_metrics, ) def extract_metrics( - self, usage: dict[str, int | dict[str, int]] | None + self, usage: dict[str, int | dict[str, int]] | None, text: str ) -> tuple[UsageMetrics, UsageMetrics]: """ Extract input and output usage metrics from audio API response usage data. @@ -436,20 +441,40 @@ def extract_metrics( in addition to standard text token counts. 
:param usage: Usage data dictionary from audio API response + :param text: Generated text for calculating word and character counts :return: Tuple of input_metrics and output_metrics as UsageMetrics objects """ if not usage: - return UsageMetrics(), UsageMetrics() + return UsageMetrics(), UsageMetrics( + text_words=len(text.split()) if text else 0, + text_characters=len(text) if text else 0, + ) - input_details: dict[str, int] = usage.get("input_token_details", {}) or {} - output_details: dict[str, int] = usage.get("output_token_details", {}) or {} + input_details: dict[str, int] = cast( + "dict[str, int]", usage.get("input_token_details", {}) or {} + ) + output_details: dict[str, int] = cast( + "dict[str, int]", usage.get("output_token_details", {}) or {} + ) + usage_metrics: dict[str, int] = cast("dict[str, int]", usage) return UsageMetrics( - text_tokens=(input_details.get("text_tokens") or usage.get("input_tokens")), + text_tokens=input_details.get("text_tokens") or 0, audio_tokens=( - input_details.get("audio_tokens") or usage.get("audio_tokens") + input_details.get("audio_tokens") + or usage_metrics.get("audio_tokens") + or usage_metrics.get("input_tokens") + or 0 + ), + audio_seconds=( + input_details.get("seconds") or usage_metrics.get("seconds") or 0 ), - audio_seconds=(input_details.get("seconds") or usage.get("seconds")), ), UsageMetrics( - text_tokens=output_details.get("text_tokens") or usage.get("output_tokens"), + text_tokens=( + output_details.get("text_tokens") + or usage_metrics.get("output_tokens") + or 0 + ), + text_words=len(text.split()) if text else 0, + text_characters=len(text) if text else 0, ) diff --git a/src/guidellm/benchmark/__init__.py b/src/guidellm/benchmark/__init__.py index ef7b2900..ed153881 100644 --- a/src/guidellm/benchmark/__init__.py +++ b/src/guidellm/benchmark/__init__.py @@ -12,7 +12,7 @@ from .benchmarker import Benchmarker from .entrypoints import benchmark_generative_text, reimport_benchmarks_report -from .output import ( +from .outputs import ( GenerativeBenchmarkerConsole, GenerativeBenchmarkerCSV, GenerativeBenchmarkerHTML, @@ -31,34 +31,43 @@ from .scenarios import get_builtin_scenarios from .schemas import ( Benchmark, - BenchmarkerArgs, - BenchmarkerDict, + BenchmarkAccumulator, + BenchmarkAccumulatorT, + BenchmarkConfig, BenchmarkGenerativeTextArgs, - BenchmarkSchedulerStats, - EstimatedBenchmarkState, + BenchmarkT, GenerativeAudioMetricsSummary, GenerativeBenchmark, + GenerativeBenchmarkAccumulator, GenerativeBenchmarksReport, + GenerativeBenchmarkTimings, GenerativeImageMetricsSummary, GenerativeMetrics, + GenerativeMetricsAccumulator, GenerativeMetricsSummary, + GenerativeRequestsAccumulator, + GenerativeTextMetricsSummary, GenerativeVideoMetricsSummary, - SchedulerDict, + RunningMetricStats, + SchedulerMetrics, + SchedulerMetricsAccumulator, ) __all__ = [ "AsyncProfile", "Benchmark", + "BenchmarkAccumulator", + "BenchmarkAccumulatorT", + "BenchmarkConfig", "BenchmarkGenerativeTextArgs", - "BenchmarkSchedulerStats", + "BenchmarkT", "Benchmarker", - "BenchmarkerArgs", - "BenchmarkerDict", "BenchmarkerProgress", "ConcurrentProfile", - "EstimatedBenchmarkState", "GenerativeAudioMetricsSummary", "GenerativeBenchmark", + "GenerativeBenchmarkAccumulator", + "GenerativeBenchmarkTimings", "GenerativeBenchmarkerCSV", "GenerativeBenchmarkerConsole", "GenerativeBenchmarkerHTML", @@ -67,11 +76,16 @@ "GenerativeConsoleBenchmarkerProgress", "GenerativeImageMetricsSummary", "GenerativeMetrics", + "GenerativeMetricsAccumulator", 
"GenerativeMetricsSummary", + "GenerativeRequestsAccumulator", + "GenerativeTextMetricsSummary", "GenerativeVideoMetricsSummary", "Profile", "ProfileType", - "SchedulerDict", + "RunningMetricStats", + "SchedulerMetrics", + "SchedulerMetricsAccumulator", "SweepProfile", "SynchronousProfile", "ThroughputProfile", diff --git a/src/guidellm/benchmark/benchmarker.py b/src/guidellm/benchmark/benchmarker.py index 8a46d44e..2195ea59 100644 --- a/src/guidellm/benchmark/benchmarker.py +++ b/src/guidellm/benchmark/benchmarker.py @@ -1,11 +1,11 @@ """ Benchmark execution orchestration and lifecycle management. -Provides the core benchmarking engine that coordinates request scheduling, -data aggregation, and result compilation across different execution strategies -and environments. The Benchmarker acts as the primary workflow coordinator, -managing the complete benchmark lifecycle from request submission through -result compilation while supporting thread-safe singleton operations. +Provides the core benchmarking engine coordinating request scheduling, +data aggregation, and result compilation across execution strategies +and environments. The Benchmarker manages the complete benchmark lifecycle +from request submission through result compilation while supporting +thread-safe singleton operations for consistent state management. """ from __future__ import annotations @@ -13,24 +13,28 @@ import uuid from abc import ABC from collections.abc import AsyncIterator, Iterable -from typing import Any, Generic +from typing import Generic from guidellm.benchmark.profile import Profile from guidellm.benchmark.progress import BenchmarkerProgress from guidellm.benchmark.schemas import ( - BenchmarkerArgs, + BenchmarkAccumulatorT, + BenchmarkConfig, BenchmarkT, - EstimatedBenchmarkState, ) from guidellm.logger import logger from guidellm.scheduler import ( BackendInterface, + Constraint, Environment, + MultiTurnRequestT, RequestT, ResponseT, Scheduler, + SchedulingStrategy, ) from guidellm.utils import ThreadSafeSingletonMixin +from guidellm.utils.mixins import InfoMixin __all__ = ["Benchmarker"] @@ -43,46 +47,45 @@ class Benchmarker( """ Abstract benchmark orchestrator for request processing workflows. - Coordinates execution of benchmarking runs across different scheduling - strategies, aggregating metrics and compiling results. Manages the complete - benchmark lifecycle from request submission through result compilation while - implementing thread-safe singleton pattern to ensure consistent state across - concurrent operations. + Coordinates benchmarking runs across scheduling strategies, aggregating + metrics and compiling results. Manages the complete benchmark lifecycle + from request submission through result compilation while implementing a + thread-safe singleton pattern for consistent state across concurrent + operations. 
""" async def run( self, + accumulator_class: type[BenchmarkAccumulatorT], benchmark_class: type[BenchmarkT], - requests: Iterable[RequestT | Iterable[RequestT | tuple[RequestT, float]]], + requests: Iterable[RequestT | MultiTurnRequestT[RequestT]], backend: BackendInterface[RequestT, ResponseT], profile: Profile, environment: Environment, - data: list[Any], - progress: BenchmarkerProgress[BenchmarkT] | None = None, + progress: ( + BenchmarkerProgress[BenchmarkAccumulatorT, BenchmarkT] | None + ) = None, sample_requests: int | None = 20, warmup: float | None = None, cooldown: float | None = None, prefer_response_metrics: bool = True, ) -> AsyncIterator[BenchmarkT]: """ - Execute benchmark runs across multiple scheduling strategies. - - Orchestrates the complete benchmark workflow by iterating through scheduling - strategies from the profile, executing requests through the scheduler, - aggregating metrics, and compiling final benchmark results. - - :param benchmark_class: Class for constructing final benchmark objects - :param requests: Request datasets for processing across strategies - :param backend: Backend interface for request processing - :param profile: Benchmark profile defining strategies and constraints - :param environment: Execution environment for coordination - :param progress: Optional progress tracker for benchmark lifecycle events - :param sample_requests: Number of sample requests to use for estimation - :param warmup: Optional warmup duration in seconds before benchmarking - :param cooldown: Optional cooldown duration in seconds after benchmarking - :param prefer_response_metrics: Whether to prefer response-based metrics over - request-based metrics - :yield: Compiled benchmark results for each strategy execution + Execute benchmark runs across scheduling strategies defined in the profile. 
+ + :param accumulator_class: Class for accumulating metrics during execution + :param benchmark_class: Class for constructing final benchmark results + :param requests: Request datasets to process across strategies + :param backend: Backend interface for executing requests + :param profile: Profile defining scheduling strategies and constraints + :param environment: Environment for execution coordination + :param progress: Optional tracker for benchmark lifecycle events + :param sample_requests: Number of requests to sample for estimation + :param warmup: Warmup duration in seconds before benchmarking + :param cooldown: Cooldown duration in seconds after benchmarking + :param prefer_response_metrics: Whether to prefer response metrics over + request metrics + :yield: Compiled benchmark result for each strategy execution :raises Exception: If benchmark execution or compilation fails """ with self.thread_lock: @@ -91,21 +94,38 @@ async def run( run_id = str(uuid.uuid4()) strategies_generator = profile.strategies_generator() + strategy: SchedulingStrategy | None + constraints: dict[str, Constraint] | None strategy, constraints = next(strategies_generator) while strategy is not None: if progress: await progress.on_benchmark_start(strategy) - args = BenchmarkerArgs( + config = BenchmarkConfig( run_id=run_id, run_index=len(profile.completed_strategies), + strategy=strategy, + constraints=( + { + key: InfoMixin.extract_from_obj(val) + for key, val in constraints.items() + } + if isinstance(constraints, dict) + else {"constraint": InfoMixin.extract_from_obj(constraints)} + if constraints + else {} + ), sample_requests=sample_requests, warmup=warmup, cooldown=cooldown, prefer_response_metrics=prefer_response_metrics, + profile=profile, + requests=InfoMixin.extract_from_obj(requests), + backend=InfoMixin.extract_from_obj(backend), + environment=InfoMixin.extract_from_obj(environment), ) - estimated_state = EstimatedBenchmarkState() + accumulator = accumulator_class(config=config) scheduler_state = None scheduler: Scheduler[RequestT, ResponseT] = Scheduler() @@ -123,9 +143,7 @@ async def run( **constraints or {}, ): try: - benchmark_class.update_estimate( - args, - estimated_state, + accumulator.update_estimate( response, request, request_info, @@ -133,7 +151,7 @@ async def run( ) if progress: await progress.on_benchmark_update( - estimated_state, scheduler_state + accumulator, scheduler_state ) except Exception as err: # noqa: BLE001 logger.error( @@ -141,17 +159,10 @@ async def run( ) benchmark = benchmark_class.compile( - args=args, - estimated_state=estimated_state, + accumulator=accumulator, scheduler_state=scheduler_state, - profile=profile, - requests=requests, - backend=backend, - environment=environment, - strategy=strategy, - constraints=constraints, - data=data, ) + if progress: await progress.on_benchmark_complete(benchmark) diff --git a/src/guidellm/benchmark/entrypoints.py b/src/guidellm/benchmark/entrypoints.py index e095ed12..c04c89a8 100644 --- a/src/guidellm/benchmark/entrypoints.py +++ b/src/guidellm/benchmark/entrypoints.py @@ -1,18 +1,17 @@ """ -High-level entry points for executing generative text benchmarks. - -This module provides the primary interface for running generative text benchmarks -through the `benchmark_generative_text` function and re-importing existing benchmark -reports via `reimport_benchmarks_report`. It orchestrates the initialization and -coordination of backends, data loaders, profiles, and output formats to execute -comprehensive benchmarking workflows. 
The module handles all resolution logic for -converting user-provided arguments into fully configured components ready for -benchmarking execution. +Primary interface for executing and re-importing generative text benchmarks. + +This module orchestrates comprehensive benchmarking workflows by coordinating backend +initialization, data loading, profile configuration, and output generation. It provides +two main entry points: `benchmark_generative_text` for executing new benchmarks and +`reimport_benchmarks_report` for re-exporting existing results. The resolution functions +convert user-provided arguments into fully configured components, handling backend +validation, data preprocessing, profile constraints, and output format specifications. """ from __future__ import annotations -from collections.abc import Callable +from collections.abc import Callable, Mapping, MutableMapping from pathlib import Path from typing import Any, Literal @@ -22,12 +21,13 @@ from guidellm.backends import Backend, BackendType from guidellm.benchmark.benchmarker import Benchmarker -from guidellm.benchmark.output import GenerativeBenchmarkerOutput +from guidellm.benchmark.outputs import GenerativeBenchmarkerOutput from guidellm.benchmark.profile import Profile, ProfileType from guidellm.benchmark.progress import GenerativeConsoleBenchmarkerProgress from guidellm.benchmark.schemas import ( BenchmarkGenerativeTextArgs, GenerativeBenchmark, + GenerativeBenchmarkAccumulator, GenerativeBenchmarksReport, ) from guidellm.data import ( @@ -36,6 +36,7 @@ GenerativeRequestCollator, PreprocessorRegistry, ProcessorFactory, + RequestFormatter, ) from guidellm.data.preprocessors import GenerativeColumnMapper from guidellm.scheduler import ( @@ -44,6 +45,7 @@ StrategyType, ) from guidellm.schemas import GenerationRequest, GenerationResponse +from guidellm.settings import settings from guidellm.utils import Console, InfoMixin __all__ = [ @@ -52,17 +54,22 @@ ] -# Helper Functions +# Type Aliases OutputFormatT = TypeAliasType( "OutputFormatT", tuple[str, ...] | list[str] - | dict[str, str | dict[str, Any] | GenerativeBenchmarkerOutput] + | Mapping[str, str | dict[str, Any] | GenerativeBenchmarkerOutput] | None, ) +"""Output format specification as strings, mappings, or configured output instances""" ProcessorInputT = TypeAliasType("ProcessorInputT", str | Path | PreTrainedTokenizerBase) +"""Processor input as model identifier, path to tokenizer, or tokenizer instance""" + + +# Helper Functions async def resolve_backend( @@ -71,9 +78,14 @@ async def resolve_backend( model: str | None, console: Console | None = None, **backend_kwargs: dict[str, Any], -) -> tuple[Backend, str | None]: +) -> tuple[Backend, str]: """ - Initialize and validate a backend instance for benchmarking. + Initialize and validate a backend instance for benchmarking execution. + + Handles backend creation from type identifiers or pre-configured instances, + performs startup validation, and resolves the default model if not specified. + The backend is shut down after validation to ensure clean state for subsequent + benchmark execution. 
:param backend: Backend type identifier or pre-configured Backend instance :param target: Target endpoint URL or connection string for the backend @@ -87,17 +99,19 @@ async def resolve_backend( if console else None ) - backend = ( + backend_instance = ( Backend.create(backend, target=target, model=model, **(backend_kwargs or {})) if not isinstance(backend, Backend) else backend ) if console_step: - console_step.update(f"{backend.__class__.__name__} backend initialized") + console_step.update( + f"{backend_instance.__class__.__name__} backend initialized" + ) - await backend.process_startup() - await backend.validate() + await backend_instance.process_startup() + await backend_instance.validate() if model is None: if console_step: @@ -105,20 +119,21 @@ async def resolve_backend( title="Resolving default model from backend.default_model", status_level="info", ) - model = await backend.default_model() + model = await backend_instance.default_model() - await backend.process_shutdown() + await backend_instance.process_shutdown() if console_step: console_step.finish( title=( - f"{backend.__class__.__name__} backend validated with model {model}" + f"{backend_instance.__class__.__name__} backend validated " + f"with model {model}" ), - details=backend.info, + details=backend_instance.info, status_level="success", ) - return backend, model + return backend_instance, model async def resolve_processor( @@ -127,7 +142,7 @@ async def resolve_processor( console: Console | None = None, ) -> ProcessorInputT | None: """ - Resolve the processor for tokenization, defaulting to model if not provided. + Resolve the tokenization processor, defaulting to model if not provided. :param processor: Processor identifier, path, tokenizer instance, or None :param model: Model identifier to use as fallback processor @@ -161,15 +176,17 @@ async def resolve_processor( async def resolve_request_loader( data: list[Any], - model: str | None, + model: str, data_args: list[dict[str, Any]] | None, data_samples: int, processor: ProcessorInputT | None, processor_args: dict[str, Any] | None, data_column_mapper: ( - DatasetPreprocessor | dict[str, str] | Literal["generative_column_mapper"] + DatasetPreprocessor + | dict[str, str | list[str]] + | Literal["generative_column_mapper"] ), - data_request_formatter: (DatasetPreprocessor | dict[str, str] | str), + data_request_formatter: (RequestFormatter | dict[str, str] | str), data_collator: Callable | Literal["generative"] | None, data_sampler: Sampler[int] | Literal["shuffle"] | None, data_num_workers: int | None, @@ -180,6 +197,11 @@ async def resolve_request_loader( """ Construct a DataLoader for GenerationRequest objects from raw data inputs. + Initializes and configures the data pipeline including column mapping, request + formatting, collation, and sampling. Resolves string-based preprocessor identifiers + from the PreprocessorRegistry and creates appropriate instances with provided + configurations. 
+ :param data: List of data sources to load requests from :param model: Model identifier for request formatting :param data_args: Arguments for each data source in the data list @@ -195,6 +217,10 @@ async def resolve_request_loader( :param console: Console instance for progress reporting, or None :param dataloader_kwargs: Additional arguments passed to DataLoader initialization :return: Configured DataLoader instance for GenerationRequest objects + :raises ValueError: If request formatter type is not registered in + PreprocessorRegistry + :raises TypeError: If registered request formatter is not a RequestFormatter + subclass """ console_step = ( console.print_update_step(title=f"Initializing request loader from {data}") @@ -202,38 +228,63 @@ async def resolve_request_loader( else None ) - if not isinstance(data_column_mapper, DatasetPreprocessor): + data_column_mapper_instance: DatasetPreprocessor + if isinstance(data_column_mapper, DatasetPreprocessor): + data_column_mapper_instance = data_column_mapper + else: column_mappings = ( data_column_mapper if isinstance(data_column_mapper, dict) else None ) - data_column_mapper = GenerativeColumnMapper( - column_mappings=column_mappings, + data_column_mapper_instance = GenerativeColumnMapper( + column_mappings=column_mappings # type: ignore[arg-type] ) - if not isinstance(data_request_formatter, DatasetPreprocessor): - request_type = ( - data_request_formatter - if isinstance(data_request_formatter, str) - else data_request_formatter.pop("request_type", "chat_completions") - ) - data_request_formatter = PreprocessorRegistry.get_registered_object( - request_type - )( + + data_request_formatter_instance: RequestFormatter + if isinstance(data_request_formatter, RequestFormatter): + data_request_formatter_instance = data_request_formatter + else: + if isinstance(data_request_formatter, str): + request_type = data_request_formatter + formatter_kwargs: dict[str, Any] = {} + else: + # Extract request_type from formatter dictionary + formatter_dict = dict(data_request_formatter) + request_type = formatter_dict.pop("request_type", settings.preferred_route) + formatter_kwargs = formatter_dict + + if ( + formatter_class := PreprocessorRegistry.get_registered_object(request_type) + ) is None: + raise ValueError( + f"Request formatter '{request_type}' is not registered in the " + f"PreprocessorRegistry." + ) + if not issubclass(formatter_class, RequestFormatter): + raise TypeError( + f"Request formatter '{request_type}' is not a subclass of " + f"RequestFormatter." 
+ ) + + data_request_formatter_instance = formatter_class( model=model, - **( - data_request_formatter - if isinstance(data_request_formatter, dict) - else {} - ), + **formatter_kwargs, ) - request_loader = DataLoader( + # Cast to proper types for the DataLoader preprocessors list + preprocessors_list: list[DatasetPreprocessor] = [ + data_column_mapper_instance, + data_request_formatter_instance, + ] + + request_loader: DataLoader[GenerationRequest] = DataLoader( data=data, data_args=data_args, data_samples=data_samples, processor_factory=ProcessorFactory( - processor=processor, processor_args=processor_args + processor=processor if processor is not None else model, + processor_args=processor_args, ), - preprocessors=[data_column_mapper, data_request_formatter], + preprocessors=preprocessors_list, collator=( data_collator if callable(data_collator) else GenerativeRequestCollator() ), @@ -259,9 +310,9 @@ async def resolve_request_loader( async def resolve_profile( profile: StrategyType | ProfileType | Profile, - rate: float | list[float] | None, + rate: list[float] | None, random_seed: int, - constraints: dict[str, ConstraintInitializer | Any], + constraints: MutableMapping[str, ConstraintInitializer | Any], max_seconds: int | float | None, max_requests: int | None, max_errors: int | None, @@ -272,6 +323,10 @@ async def resolve_profile( """ Resolve and configure a benchmark profile with rate and constraint settings. + Constructs a Profile instance from type identifiers or validates pre-configured + profiles. Constraint parameters are merged into the constraints dictionary before + profile creation. + :param profile: Profile type identifier or pre-configured Profile instance :param rate: Request rate(s) for the benchmark execution :param random_seed: Seed for reproducible random operations @@ -361,20 +416,22 @@ async def benchmark_generative_text( args: BenchmarkGenerativeTextArgs, progress: GenerativeConsoleBenchmarkerProgress | None = None, console: Console | None = None, - **constraints: dict[str, ConstraintInitializer | Any], + **constraints: str | ConstraintInitializer | Any, ) -> tuple[GenerativeBenchmarksReport, dict[str, Any]]: """ Execute a comprehensive generative text benchmarking workflow. - Orchestrates the full benchmarking pipeline by resolving all components (backend, - data loader, profile, outputs) from provided arguments, executing the benchmark - runs, and finalizing results in the specified output formats. + Orchestrates the full benchmarking pipeline by resolving all components from + provided arguments, executing benchmark runs across configured profiles, and + finalizing results in specified output formats. Components include backend + initialization, data loading, profile configuration, and output generation. 
:param args: Configuration arguments for the benchmark execution :param progress: Progress tracker for benchmark execution, or None for no tracking :param console: Console instance for status reporting, or None for silent operation :param constraints: Additional constraint initializers for benchmark limits - :return: Tuple of GenerativeBenchmarksReport and dictionary of output format results + :return: Tuple of GenerativeBenchmarksReport and dictionary of output format + results """ backend, model = await resolve_backend( backend=args.backend, @@ -431,12 +488,12 @@ async def benchmark_generative_text( GenerativeBenchmark, GenerationRequest, GenerationResponse ] = Benchmarker() async for benchmark in benchmarker.run( - benchmark_class=args.benchmark_cls, + accumulator_class=GenerativeBenchmarkAccumulator, + benchmark_class=GenerativeBenchmark, requests=request_loader, backend=backend, profile=profile, environment=NonDistributedEnvironment(), - data=args.data, progress=progress, sample_requests=args.sample_requests, warmup=args.warmup, @@ -472,12 +529,13 @@ async def reimport_benchmarks_report( output_formats: OutputFormatT = ("console", "json", "html", "csv"), ) -> tuple[GenerativeBenchmarksReport, dict[str, Any]]: """ - Load and re-export an existing benchmarks report in specified formats. + Load and re-export an existing benchmarks report in specified output formats. :param file: Path to the existing benchmark report file to load :param output_path: Base path for output file generation, or None for default :param output_formats: Specification of desired output formats for the report - :return: Tuple of loaded GenerativeBenchmarksReport and dictionary of output results + :return: Tuple of loaded GenerativeBenchmarksReport and dictionary of output + results """ console = Console() @@ -490,11 +548,11 @@ async def reimport_benchmarks_report( f" loaded {len(report.benchmarks)} benchmark(s)" ) - output_formats = await resolve_output_formats( + resolved_output_formats = await resolve_output_formats( output_formats, output_path, console=console ) output_format_results = {} - for key, output in output_formats.items(): + for key, output in resolved_output_formats.items(): output_result = await output.finalize(report) output_format_results[key] = output_result diff --git a/src/guidellm/benchmark/output.py b/src/guidellm/benchmark/output.py deleted file mode 100644 index 6e17de5b..00000000 --- a/src/guidellm/benchmark/output.py +++ /dev/null @@ -1,745 +0,0 @@ -from __future__ import annotations - -import csv -import json -import math -from abc import ABC, abstractmethod -from collections import OrderedDict -from copy import deepcopy -from datetime import datetime -from pathlib import Path -from typing import Any, ClassVar - -from pydantic import BaseModel, ConfigDict, Field -from rich.console import Console -from rich.padding import Padding -from rich.text import Text - -from guidellm.benchmark.profile import ( - AsyncProfile, - ConcurrentProfile, - SweepProfile, - ThroughputProfile, -) -from guidellm.benchmark.schemas import ( - GenerativeBenchmark, - GenerativeBenchmarksReport, - GenerativeMetrics, -) -from guidellm.presentation import UIDataBuilder -from guidellm.presentation.injector import create_report -from guidellm.settings import settings -from guidellm.utils import ( - Colors, - DistributionSummary, - RegistryMixin, - StatusDistributionSummary, - camelize_str, - recursive_key_update, - safe_format_timestamp, - split_text_list_by_length, -) - -__all__ = [ - "GenerativeBenchmarkerCSV", - 
"GenerativeBenchmarkerConsole", - "GenerativeBenchmarkerHTML", - "GenerativeBenchmarkerOutput", -] - - -class GenerativeBenchmarkerOutput( - BaseModel, RegistryMixin[type["GenerativeBenchmarkerOutput"]], ABC -): - model_config = ConfigDict( - extra="ignore", - arbitrary_types_allowed=True, - validate_assignment=True, - from_attributes=True, - use_enum_values=True, - ) - - @classmethod - @abstractmethod - def validated_kwargs(cls, *args, **kwargs) -> dict[str, Any]: - """ - Validate and process arguments for constraint creation. - - Must be implemented by subclasses to handle their specific parameter patterns. - - :param args: Positional arguments passed to the constraint - :param kwargs: Keyword arguments passed to the constraint - :return: Validated dictionary of parameters for constraint creation - :raises NotImplementedError: Must be implemented by subclasses - """ - ... - - @classmethod - def resolve( - cls, - output_formats: ( - tuple[str, ...] - | list[str] - | dict[ - str, - Any | dict[str, Any] | GenerativeBenchmarkerOutput, - ] - | None - ), - output_path: str | Path | None, - ) -> dict[str, GenerativeBenchmarkerOutput]: - if not output_formats: - return {} - - if isinstance(output_formats, list | tuple): - # support list of output keys: ["csv", "json"] - # support list of files: ["path/to/file.json", "path/to/file.csv"] - formats_list = output_formats - output_formats = {} - for output_format in formats_list: - if not isinstance(output_format, str): - raise TypeError( - f"Expected string format, got {type(output_format)} for " - f"{output_format} in {formats_list}" - ) - try: - if cls.is_registered(output_format): - output_formats[output_format] = {} - else: - # treat it as a file save location - path = Path(output_format) - format_type = path.suffix[1:].lower() - output_formats[format_type] = {"output_path": path} - - except Exception as err: - raise ValueError( - f"Failed to resolve output format '{output_format}': {err}" - ) from err - - resolved = {} - - for key, val in output_formats.items(): - if isinstance(val, GenerativeBenchmarkerOutput): - resolved[key] = val - else: - output_class = cls.get_registered_object(key) - kwargs = {"output_path": output_path} - - if isinstance(val, dict): - kwargs.update(val) - kwargs = output_class.validated_kwargs(**kwargs) - else: - kwargs = output_class.validated_kwargs(val, **kwargs) - - resolved[key] = output_class(**kwargs) - - return resolved - - @abstractmethod - async def finalize(self, report: GenerativeBenchmarksReport) -> Any: ... 
- - -@GenerativeBenchmarkerOutput.register(["json", "yaml"]) -class GenerativeBenchmarkerSerialized(GenerativeBenchmarkerOutput): - @classmethod - def validated_kwargs( - cls, output_path: str | Path | None, **_kwargs - ) -> dict[str, Any]: - new_kwargs = {} - if output_path is not None: - new_kwargs["output_path"] = ( - Path(output_path) if not isinstance(output_path, Path) else output_path - ) - return new_kwargs - - output_path: Path = Field(default_factory=lambda: Path.cwd()) - - async def finalize(self, report: GenerativeBenchmarksReport) -> Path: - return report.save_file(self.output_path) - - -@GenerativeBenchmarkerOutput.register("console") -class GenerativeBenchmarkerConsole(GenerativeBenchmarkerOutput): - """Console output formatter for benchmark results with rich formatting.""" - - @classmethod - def validated_kwargs(cls, *_args, **_kwargs) -> dict[str, Any]: - return {} - - console: Console = Field(default_factory=Console) - - async def finalize(self, report: GenerativeBenchmarksReport) -> str: - """ - Print the complete benchmark report to the console. - - :param report: The completed benchmark report. - :return: - """ - self._print_benchmarks_metadata(report.benchmarks) - self._print_benchmarks_info(report.benchmarks) - self._print_benchmarks_stats(report.benchmarks) - - return "printed to console" - - def _print_benchmarks_metadata(self, benchmarks: list[GenerativeBenchmark]): - start_time = benchmarks[0].run_stats.start_time - end_time = benchmarks[-1].run_stats.end_time - duration = end_time - start_time - - self._print_section_header("Benchmarks Metadata") - self._print_labeled_line("Run id", str(benchmarks[0].run_id)) - self._print_labeled_line("Duration", f"{duration:.1f} seconds") - self._print_labeled_line("Profile", self._get_profile_str(benchmarks[0])) - - def _print_benchmarks_info(self, benchmarks: list[GenerativeBenchmark]): - sections = { - "Metadata": (0, 3), - "Requests Made": (4, 6), - "Prompt Tok/Req": (7, 9), - "Output Tok/Req": (10, 12), - "Prompt Tok Total": (13, 15), - "Output Tok Total": (16, 18), - } - headers = [ - "Benchmark", - "Start Time", - "End Time", - "Duration (s)", - "Comp", - "Inc", - "Err", - "Comp", - "Inc", - "Err", - "Comp", - "Inc", - "Err", - "Comp", - "Inc", - "Err", - "Comp", - "Inc", - "Err", - ] - - rows = [] - for benchmark in benchmarks: - rows.append( - [ - str(benchmark.scheduler.strategy), - safe_format_timestamp(benchmark.start_time), - safe_format_timestamp(benchmark.end_time), - f"{(benchmark.end_time - benchmark.start_time):.1f}", - f"{benchmark.request_totals.successful:.0f}", - f"{benchmark.request_totals.incomplete:.0f}", - f"{benchmark.request_totals.errored:.0f}", - f"{benchmark.metrics.prompt_token_count.successful.mean:.1f}", - f"{benchmark.metrics.prompt_token_count.incomplete.mean:.1f}", - f"{benchmark.metrics.prompt_token_count.errored.mean:.1f}", - f"{benchmark.metrics.output_token_count.successful.mean:.1f}", - f"{benchmark.metrics.output_token_count.incomplete.mean:.1f}", - f"{benchmark.metrics.output_token_count.errored.mean:.1f}", - f"{benchmark.metrics.prompt_token_count.successful.total_sum:.0f}", - f"{benchmark.metrics.prompt_token_count.incomplete.total_sum:.0f}", - f"{benchmark.metrics.prompt_token_count.errored.total_sum:.0f}", - f"{benchmark.metrics.output_token_count.successful.total_sum:.0f}", - f"{benchmark.metrics.output_token_count.incomplete.total_sum:.0f}", - f"{benchmark.metrics.output_token_count.errored.total_sum:.0f}", - ] - ) - - self._print_table(headers, rows, "Benchmarks Info", 
sections) - - def _print_benchmarks_stats(self, benchmarks: list[GenerativeBenchmark]): - sections = { - "Metadata": (0, 0), - "Request Stats": (1, 2), - "Out Tok/sec": (3, 3), - "Tot Tok/sec": (4, 4), - "Req Latency (sec)": (5, 7), - "TTFT (ms)": (8, 10), - "ITL (ms)": (11, 13), - "TPOT (ms)": (14, 16), - } - headers = [ - "Benchmark", - "Per Second", - "Concurrency", - "mean", - "mean", - "mean", - "median", - "p99", - "mean", - "median", - "p99", - "mean", - "median", - "p99", - "mean", - "median", - "p99", - ] - - rows = [] - for benchmark in benchmarks: - rows.append( - [ - str(benchmark.scheduler.strategy), - f"{benchmark.metrics.requests_per_second.successful.mean:.2f}", - f"{benchmark.metrics.request_concurrency.successful.mean:.2f}", - f"{benchmark.metrics.output_tokens_per_second.successful.mean:.1f}", - f"{benchmark.metrics.tokens_per_second.successful.mean:.1f}", - f"{benchmark.metrics.request_latency.successful.mean:.2f}", - f"{benchmark.metrics.request_latency.successful.median:.2f}", - f"{benchmark.metrics.request_latency.successful.percentiles.p99:.2f}", - f"{benchmark.metrics.time_to_first_token_ms.successful.mean:.1f}", - f"{benchmark.metrics.time_to_first_token_ms.successful.median:.1f}", - f"{benchmark.metrics.time_to_first_token_ms.successful.percentiles.p99:.1f}", - f"{benchmark.metrics.inter_token_latency_ms.successful.mean:.1f}", - f"{benchmark.metrics.inter_token_latency_ms.successful.median:.1f}", - f"{benchmark.metrics.inter_token_latency_ms.successful.percentiles.p99:.1f}", - f"{benchmark.metrics.time_per_output_token_ms.successful.mean:.1f}", - f"{benchmark.metrics.time_per_output_token_ms.successful.median:.1f}", - f"{benchmark.metrics.time_per_output_token_ms.successful.percentiles.p99:.1f}", - ] - ) - - self._print_table(headers, rows, "Benchmarks Stats", sections) - - def _get_profile_str(self, benchmark: GenerativeBenchmark) -> str: - profile = benchmark.benchmarker.profile - if profile is None: - return "None" - - profile_args = OrderedDict( - { - "type": profile.type_, - "strategies": getattr(profile, "strategy_types", []), - } - ) - - if isinstance(profile, ConcurrentProfile): - profile_args["streams"] = str(profile.streams) - elif isinstance(profile, ThroughputProfile): - profile_args["max_concurrency"] = str(profile.max_concurrency) - elif isinstance(profile, AsyncProfile): - profile_args["max_concurrency"] = str(profile.max_concurrency) - profile_args["rate"] = str(profile.rate) - elif isinstance(profile, SweepProfile): - profile_args["sweep_size"] = str(profile.sweep_size) - - return ", ".join(f"{key}={value}" for key, value in profile_args.items()) - - def _print_section_header(self, title: str, indent: int = 0, new_lines: int = 2): - self._print_line( - f"{title}:", - f"bold underline {Colors.info}", - indent=indent, - new_lines=new_lines, - ) - - def _print_labeled_line( - self, label: str, value: str, indent: int = 4, new_lines: int = 0 - ): - self._print_line( - [label + ":", value], - ["bold " + Colors.info, "italic"], - new_lines=new_lines, - indent=indent, - ) - - def _print_line( - self, - value: str | list[str], - style: str | list[str] = "", - indent: int = 0, - new_lines: int = 0, - ): - text = Text() - for _ in range(new_lines): - text.append("\n") - - if not isinstance(value, list): - value = [value] - if not isinstance(style, list): - style = [style for _ in range(len(value))] - - if len(value) != len(style): - raise ValueError( - f"Value and style length mismatch: {len(value)} vs {len(style)}" - ) - - for val, sty in zip(value, 
style, strict=False): - text.append(val, style=sty) - - self.console.print(Padding.indent(text, indent)) - - def _print_table( - self, - headers: list[str], - rows: list[list[Any]], - title: str, - sections: dict[str, tuple[int, int]] | None = None, - max_char_per_col: int = 1024, - indent: int = 0, - new_lines: int = 2, - ): - if rows and any(len(row) != len(headers) for row in rows): - raise ValueError( - "Headers and rows length mismatch: " - f"{len(headers)} vs {len(rows[0]) if rows else 'N/A'}" - ) - - max_chars_per_column = self._calculate_max_chars_per_column( - headers, rows, sections, max_char_per_col - ) - - self._print_section_header(title, indent=indent, new_lines=new_lines) - self._print_table_divider(max_chars_per_column, False, indent) - if sections: - self._print_table_sections(sections, max_chars_per_column, indent) - self._print_table_row( - split_text_list_by_length(headers, max_chars_per_column), - f"bold {Colors.info}", - indent, - ) - self._print_table_divider(max_chars_per_column, True, indent) - for row in rows: - self._print_table_row( - split_text_list_by_length(row, max_chars_per_column), - "italic", - indent, - ) - self._print_table_divider(max_chars_per_column, False, indent) - - def _calculate_max_chars_per_column( - self, - headers: list[str], - rows: list[list[Any]], - sections: dict[str, tuple[int, int]] | None, - max_char_per_col: int, - ) -> list[int]: - """Calculate maximum characters per column for table formatting.""" - max_chars_per_column = [] - for ind in range(len(headers)): - max_chars_per_column.append(min(len(headers[ind]), max_char_per_col)) - for row in rows: - max_chars_per_column[ind] = max( - max_chars_per_column[ind], len(str(row[ind])) - ) - - if not sections: - return max_chars_per_column - - for section, (start_col, end_col) in sections.items(): - min_section_len = len(section) + (end_col - start_col) - chars_in_columns = sum( - max_chars_per_column[start_col : end_col + 1] - ) + 2 * (end_col - start_col) - if min_section_len > chars_in_columns: - add_chars_per_col = math.ceil( - (min_section_len - chars_in_columns) / (end_col - start_col + 1) - ) - for col in range(start_col, end_col + 1): - max_chars_per_column[col] += add_chars_per_col - - return max_chars_per_column - - def _print_table_divider( - self, max_chars_per_column: list[int], include_separators: bool, indent: int = 0 - ): - """Print table divider line.""" - if include_separators: - columns = [ - settings.table_headers_border_char * max_chars - + settings.table_column_separator_char - + settings.table_headers_border_char - for max_chars in max_chars_per_column - ] - else: - columns = [ - settings.table_border_char * (max_chars + 2) - for max_chars in max_chars_per_column - ] - columns[-1] = columns[-1][:-2] - self._print_line(columns, Colors.info, indent) - - def _print_table_sections( - self, - sections: dict[str, tuple[int, int]], - max_chars_per_column: list[int], - indent: int = 0, - ): - section_tuples = [(start, end, name) for name, (start, end) in sections.items()] - section_tuples.sort(key=lambda x: x[0]) - - if any(start > end for start, end, _ in section_tuples): - raise ValueError(f"Invalid section ranges: {section_tuples}") - - if ( - any( - section_tuples[ind][1] + 1 != section_tuples[ind + 1][0] - for ind in range(len(section_tuples) - 1) - ) - or section_tuples[0][0] != 0 - or section_tuples[-1][1] != len(max_chars_per_column) - 1 - ): - raise ValueError(f"Invalid section ranges: {section_tuples}") - - line_values = [] - line_styles = [] - for section, 
(start_col, end_col) in sections.items(): - section_length = sum(max_chars_per_column[start_col : end_col + 1]) + 2 * ( - end_col - start_col + 1 - ) - num_separators = end_col - start_col - line_values.extend( - [ - section, - " " * (section_length - len(section) - num_separators - 2), - settings.table_column_separator_char * num_separators, - settings.table_column_separator_char + " ", - ] - ) - line_styles.extend(["bold " + Colors.info, "", "", Colors.info]) - - line_values = line_values[:-1] - line_styles = line_styles[:-1] - self._print_line(line_values, line_styles, indent) - - def _print_table_row( - self, column_lines: list[list[str]], style: str, indent: int = 0 - ): - for row in range(len(column_lines[0])): - print_line = [] - print_styles = [] - for column in range(len(column_lines)): - print_line.extend( - [ - column_lines[column][row], - settings.table_column_separator_char, - " ", - ] - ) - print_styles.extend([style, Colors.info, ""]) - print_line = print_line[:-2] - print_styles = print_styles[:-2] - self._print_line(print_line, print_styles, indent) - - -@GenerativeBenchmarkerOutput.register("csv") -class GenerativeBenchmarkerCSV(GenerativeBenchmarkerOutput): - """CSV output formatter for benchmark results.""" - - DEFAULT_FILE: ClassVar[str] = "benchmarks.csv" - - @classmethod - def validated_kwargs( - cls, output_path: str | Path | None, **_kwargs - ) -> dict[str, Any]: - new_kwargs = {} - if output_path is not None: - new_kwargs["output_path"] = ( - Path(output_path) if not isinstance(output_path, Path) else output_path - ) - return new_kwargs - - output_path: Path = Field(default_factory=lambda: Path.cwd()) - - async def finalize(self, report: GenerativeBenchmarksReport) -> Path: - """ - Save the benchmark report as a CSV file. - - :param report: The completed benchmark report. - :return: Path to the saved CSV file. 
- """ - output_path = self.output_path - if output_path.is_dir(): - output_path = output_path / GenerativeBenchmarkerCSV.DEFAULT_FILE - output_path.parent.mkdir(parents=True, exist_ok=True) - - with output_path.open("w", newline="") as file: - writer = csv.writer(file) - headers: list[str] = [] - rows: list[list[str | float | list[float]]] = [] - - for benchmark in report.benchmarks: - benchmark_headers: list[str] = [] - benchmark_values: list[str | float | list[float]] = [] - - # Add basic run description info - desc_headers, desc_values = self._get_benchmark_desc_headers_and_values( - benchmark - ) - benchmark_headers.extend(desc_headers) - benchmark_values.extend(desc_values) - - # Add status-based metrics - for status in StatusDistributionSummary.model_fields: - status_headers, status_values = ( - self._get_benchmark_status_headers_and_values(benchmark, status) - ) - benchmark_headers.extend(status_headers) - benchmark_values.extend(status_values) - - # Add extra fields - extras_headers, extras_values = ( - self._get_benchmark_extras_headers_and_values(benchmark) - ) - benchmark_headers.extend(extras_headers) - benchmark_values.extend(extras_values) - - if not headers: - headers = benchmark_headers - rows.append(benchmark_values) - - writer.writerow(headers) - for row in rows: - writer.writerow(row) - - return output_path - - def _get_benchmark_desc_headers_and_values( - self, benchmark: GenerativeBenchmark - ) -> tuple[list[str], list[str | float]]: - """Get description headers and values for a benchmark.""" - headers = [ - "Type", - "Run Id", - "Id", - "Name", - "Start Time", - "End Time", - "Duration", - ] - values: list[str | float] = [ - benchmark.type_, - benchmark.run_id, - benchmark.id_, - str(benchmark.scheduler.strategy), - datetime.fromtimestamp(benchmark.start_time).strftime("%Y-%m-%d %H:%M:%S"), - datetime.fromtimestamp(benchmark.end_time).strftime("%Y-%m-%d %H:%M:%S"), - benchmark.duration, - ] - return headers, values - - def _get_benchmark_status_headers_and_values( - self, benchmark: GenerativeBenchmark, status: str - ) -> tuple[list[str], list[float | list[float]]]: - """Get status-based metrics headers and values for a benchmark.""" - headers = [f"{status.capitalize()} Requests"] - values = [getattr(benchmark.request_totals, status)] - - for metric in GenerativeMetrics.model_fields: - metric_headers, metric_values = self._get_benchmark_status_metrics_stats( - benchmark, status, metric - ) - headers.extend(metric_headers) - values.extend(metric_values) - - return headers, values - - def _get_benchmark_status_metrics_stats( - self, benchmark: GenerativeBenchmark, status: str, metric: str - ) -> tuple[list[str], list[float | list[float]]]: - """Get statistical metrics for a specific status and metric.""" - status_display = status.capitalize() - metric_display = metric.replace("_", " ").capitalize() - status_dist_summary: StatusDistributionSummary = getattr( - benchmark.metrics, metric - ) - if not hasattr(status_dist_summary, status): - return [], [] - dist_summary: DistributionSummary = getattr(status_dist_summary, status) - - headers = [ - f"{status_display} {metric_display} mean", - f"{status_display} {metric_display} median", - f"{status_display} {metric_display} std dev", - ( - f"{status_display} {metric_display} " - "[min, 0.1, 1, 5, 10, 25, 75, 90, 95, 99, max]" - ), - ] - values: list[float | list[float]] = [ - dist_summary.mean, - dist_summary.median, - dist_summary.std_dev, - [ - dist_summary.min, - dist_summary.percentiles.p001, - 
dist_summary.percentiles.p01, - dist_summary.percentiles.p05, - dist_summary.percentiles.p10, - dist_summary.percentiles.p25, - dist_summary.percentiles.p75, - dist_summary.percentiles.p90, - dist_summary.percentiles.p95, - dist_summary.percentiles.p99, - dist_summary.max, - ], - ] - return headers, values - - def _get_benchmark_extras_headers_and_values( - self, - benchmark: GenerativeBenchmark, - ) -> tuple[list[str], list[str]]: - headers = ["Profile", "Backend", "Generator Data"] - values: list[str] = [ - benchmark.benchmarker.profile.model_dump_json(), - json.dumps(benchmark.benchmarker.backend), - json.dumps(benchmark.benchmarker.requests["data"]), - ] - - if len(headers) != len(values): - raise ValueError("Headers and values length mismatch.") - - return headers, values - - -@GenerativeBenchmarkerOutput.register("html") -class GenerativeBenchmarkerHTML(GenerativeBenchmarkerOutput): - """HTML output formatter for benchmark results.""" - - DEFAULT_FILE: ClassVar[str] = "benchmarks.html" - - @classmethod - def validated_kwargs( - cls, output_path: str | Path | None, **_kwargs - ) -> dict[str, Any]: - new_kwargs = {} - if output_path is not None: - new_kwargs["output_path"] = ( - Path(output_path) if not isinstance(output_path, Path) else output_path - ) - return new_kwargs - - output_path: Path = Field(default_factory=lambda: Path.cwd()) - - async def finalize(self, report: GenerativeBenchmarksReport) -> Path: - """ - Save the benchmark report as an HTML file. - - :param report: The completed benchmark report. - :return: Path to the saved HTML file. - """ - output_path = self.output_path - if output_path.is_dir(): - output_path = output_path / GenerativeBenchmarkerHTML.DEFAULT_FILE - output_path.parent.mkdir(parents=True, exist_ok=True) - - data_builder = UIDataBuilder(report.benchmarks) - data = data_builder.to_dict() - camel_data = recursive_key_update(deepcopy(data), camelize_str) - - ui_api_data = {} - for k, v in camel_data.items(): - placeholder_key = f"window.{k} = {{}};" - replacement_value = f"window.{k} = {json.dumps(v, indent=2)};\n" - ui_api_data[placeholder_key] = replacement_value - - create_report(ui_api_data, output_path) - - return output_path diff --git a/src/guidellm/benchmark/outputs/__init__.py b/src/guidellm/benchmark/outputs/__init__.py new file mode 100644 index 00000000..2e321605 --- /dev/null +++ b/src/guidellm/benchmark/outputs/__init__.py @@ -0,0 +1,24 @@ +""" +Output formatters for benchmark results. + +Provides output formatter implementations that transform benchmark reports into +various file formats including JSON, CSV, HTML, and console display. All formatters +extend the base GenerativeBenchmarkerOutput interface, enabling dynamic resolution +and flexible output configuration for benchmark result persistence and analysis. 
+""" + +from __future__ import annotations + +from .console import GenerativeBenchmarkerConsole +from .csv import GenerativeBenchmarkerCSV +from .html import GenerativeBenchmarkerHTML +from .output import GenerativeBenchmarkerOutput +from .serialized import GenerativeBenchmarkerSerialized + +__all__ = [ + "GenerativeBenchmarkerCSV", + "GenerativeBenchmarkerConsole", + "GenerativeBenchmarkerHTML", + "GenerativeBenchmarkerOutput", + "GenerativeBenchmarkerSerialized", +] diff --git a/src/guidellm/benchmark/outputs/console.py b/src/guidellm/benchmark/outputs/console.py new file mode 100644 index 00000000..2dc8ce3c --- /dev/null +++ b/src/guidellm/benchmark/outputs/console.py @@ -0,0 +1,620 @@ +""" +Console output formatter for generative benchmarker results. + +This module provides console-based output formatting for benchmark reports, organizing +metrics into structured tables that display request statistics, latency measurements, +throughput data, and modality-specific metrics (text, image, video, audio). It uses +the Console utility to render multi-column tables with proper alignment and formatting +for terminal display. +""" + +from __future__ import annotations + +from collections import defaultdict +from dataclasses import dataclass, field +from typing import Any, Literal, cast + +from pydantic import Field + +from guidellm.benchmark.outputs.output import GenerativeBenchmarkerOutput +from guidellm.benchmark.schemas import GenerativeBenchmarksReport +from guidellm.schemas import DistributionSummary, StatusDistributionSummary +from guidellm.utils import Console, safe_format_number, safe_format_timestamp + +__all__ = ["GenerativeBenchmarkerConsole"] + + +@dataclass +class ConsoleTableColumn: + """ + Data structure for a single console table column. + + Stores column metadata (group, name, units, type) and accumulated values for + rendering formatted table output with proper type-specific formatting and precision. + + :cvar group: Optional group header for related columns + :cvar name: Column name displayed in header + :cvar units: Optional unit label for numeric values + :cvar type_: Data type determining formatting (number, text, timestamp) + :cvar precision: Decimal precision for numeric formatting + :cvar values: Accumulated values for this column across rows + """ + + group: str | None = None + name: str | None = None + units: str | None = None + type_: Literal["number", "text", "timestamp"] = "number" + precision: int = 1 + values: list[str | float | int | None] = field(default_factory=list) + + +class ConsoleTableColumnsCollection(dict[str, ConsoleTableColumn]): + """ + Collection manager for console table columns. + + Extends dict to provide specialized methods for adding values and statistics to + columns, automatically creating columns as needed and organizing them by composite + keys for consistent table rendering. + """ + + def add_value( + self, + value: str | float | int | None, + group: str | None = None, + name: str | None = None, + units: str | None = None, + type_: Literal["number", "text", "timestamp"] = "number", + precision: int = 1, + ): + """ + Add a value to a column, creating the column if it doesn't exist. 
+ + :param value: The value to add to the column + :param group: Optional group header for the column + :param name: Column name for display + :param units: Optional unit label + :param type_: Data type for formatting + :param precision: Decimal precision for numbers + """ + key = f"{group}_{name}_{units}" + + if key not in self: + self[key] = ConsoleTableColumn( + group=group, name=name, units=units, type_=type_, precision=precision + ) + + self[key].values.append(value) + + def add_stats( + self, + stats: StatusDistributionSummary | None, + status: Literal["successful", "incomplete", "errored", "total"] = "successful", + group: str | None = None, + name: str | None = None, + precision: int = 1, + ): + """ + Add statistical summary columns (mean and p95) for a metric. + + Creates paired mean/p95 columns automatically and appends values from the + specified status category of the distribution summary. + + :param stats: Distribution summary containing status-specific statistics + :param status: Status category to extract statistics from + :param group: Optional group header for the columns + :param name: Column name for display + :param precision: Decimal precision for numbers + """ + key = f"{group}_{name}" + + if f"{key}_mean" not in self: + self[f"{key}_mean"] = ConsoleTableColumn( + group=group, name=name, units="Mean", precision=precision + ) + self[f"{key}_p95"] = ConsoleTableColumn( + group=group, name=name, units="p95", precision=precision + ) + + status_stats: DistributionSummary | None = ( + getattr(stats, status) if stats else None + ) + self[f"{key}_mean"].values.append(status_stats.mean if status_stats else None) + self[f"{key}_p95"].values.append( + status_stats.percentiles.p95 if status_stats else None + ) + + def get_table_data(self) -> tuple[list[list[str]], list[list[str]]]: + """ + Convert column collection to formatted table data. + + Transforms stored columns and values into header and value lists suitable for + console table rendering, applying type-specific formatting. + + :return: Tuple of (headers, values) where each is a list of column string lists + """ + headers: list[list[str]] = [] + values: list[list[str]] = [] + + for column in self.values(): + headers.append([column.group or "", column.name or "", column.units or ""]) + formatted_values: list[str] = [] + for value in column.values: + if column.type_ == "text": + formatted_values.append(str(value)) + continue + + if not isinstance(value, float | int) and value is not None: + raise ValueError( + f"Expected numeric value for column '{column.name}', " + f"got: {value}" + ) + + if column.type_ == "timestamp": + formatted_values.append( + safe_format_timestamp(cast("float | None", value)) + ) + elif column.type_ == "number": + formatted_values.append( + safe_format_number( + value, + precision=column.precision, + ) + ) + else: + raise ValueError(f"Unsupported column type: {column.type_}") + values.append(formatted_values) + + return headers, values + + +@GenerativeBenchmarkerOutput.register("console") +class GenerativeBenchmarkerConsole(GenerativeBenchmarkerOutput): + """ + Console output formatter for benchmark reports. + + Renders benchmark results as formatted tables in the terminal, organizing metrics + by category (run summary, request counts, latency, throughput, modality-specific) + with proper alignment and type-specific formatting for readability. + """ + + @classmethod + def validated_kwargs(cls, *_args, **_kwargs) -> dict[str, Any]: + """ + Validate and return keyword arguments for initialization. 
+ + :return: Empty dict as no additional kwargs are required + """ + return {} + + console: Console = Field( + default_factory=Console, + description="Console utility for rendering formatted tables", + ) + + async def finalize(self, report: GenerativeBenchmarksReport) -> str: + """ + Print the complete benchmark report to the console. + + Renders all metric tables including run summary, request counts, latency, + throughput, and modality-specific statistics to the console. + + :param report: The completed benchmark report + :return: Status message indicating output location + """ + self.print_run_summary_table(report) + self.print_text_table(report) + self.print_image_table(report) + self.print_video_table(report) + self.print_audio_table(report) + self.print_request_counts_table(report) + self.print_request_latency_table(report) + self.print_server_throughput_table(report) + + return "printed to console" + + def print_run_summary_table(self, report: GenerativeBenchmarksReport): + """ + Print the run summary table with timing and token information. + + :param report: The benchmark report containing run metadata + """ + columns = ConsoleTableColumnsCollection() + + for benchmark in report.benchmarks: + columns.add_value( + benchmark.config.strategy.type_, + group="Benchmark", + name="Strategy", + type_="text", + ) + columns.add_value( + benchmark.start_time, group="Timings", name="Start", type_="timestamp" + ) + columns.add_value( + benchmark.end_time, group="Timings", name="End", type_="timestamp" + ) + columns.add_value( + benchmark.duration, group="Timings", name="Dur", units="Sec" + ) + columns.add_value( + report.args.warmup, group="Timings", name="Warm", units="Sec" + ) + columns.add_value( + report.args.cooldown, group="Timings", name="Cool", units="Sec" + ) + + for token_metrics, group in [ + (benchmark.metrics.prompt_token_count, "Input Tokens"), + (benchmark.metrics.output_token_count, "Output Tokens"), + ]: + columns.add_value( + token_metrics.successful.total_sum, + group=group, + name="Comp", + units="Tot", + ) + columns.add_value( + token_metrics.incomplete.total_sum, + group=group, + name="Inc", + units="Tot", + ) + columns.add_value( + token_metrics.errored.total_sum, + group=group, + name="Err", + units="Tot", + ) + + headers, values = columns.get_table_data() + self.console.print("\n") + self.console.print_table(headers, values, title="Run Summary Info") + + def print_text_table(self, report: GenerativeBenchmarksReport): + """ + Print text-specific metrics table if any text data exists. + + :param report: The benchmark report containing text metrics + """ + self._print_modality_table( + report=report, + modality="text", + title="Text Metrics Statistics (Completed Requests)", + metric_groups=[ + ("tokens", "Tokens"), + ("words", "Words"), + ("characters", "Characters"), + ], + ) + + def print_image_table(self, report: GenerativeBenchmarksReport): + """ + Print image-specific metrics table if any image data exists. + + :param report: The benchmark report containing image metrics + """ + self._print_modality_table( + report=report, + modality="image", + title="Image Metrics Statistics (Completed Requests)", + metric_groups=[ + ("tokens", "Tokens"), + ("images", "Images"), + ("pixels", "Pixels"), + ("bytes", "Bytes"), + ], + ) + + def print_video_table(self, report: GenerativeBenchmarksReport): + """ + Print video-specific metrics table if any video data exists. 
+ + :param report: The benchmark report containing video metrics + """ + self._print_modality_table( + report=report, + modality="video", + title="Video Metrics Statistics (Completed Requests)", + metric_groups=[ + ("tokens", "Tokens"), + ("frames", "Frames"), + ("seconds", "Seconds"), + ("bytes", "Bytes"), + ], + ) + + def print_audio_table(self, report: GenerativeBenchmarksReport): + """ + Print audio-specific metrics table if any audio data exists. + + :param report: The benchmark report containing audio metrics + """ + self._print_modality_table( + report=report, + modality="audio", + title="Audio Metrics Statistics (Completed Requests)", + metric_groups=[ + ("tokens", "Tokens"), + ("samples", "Samples"), + ("seconds", "Seconds"), + ("bytes", "Bytes"), + ], + ) + + def print_request_counts_table(self, report: GenerativeBenchmarksReport): + """ + Print request token count statistics table. + + :param report: The benchmark report containing request count metrics + """ + columns = ConsoleTableColumnsCollection() + + for benchmark in report.benchmarks: + columns.add_value( + benchmark.config.strategy.type_, + group="Benchmark", + name="Strategy", + type_="text", + ) + columns.add_stats( + benchmark.metrics.prompt_token_count, + group="Input Tok", + name="Per Req", + ) + columns.add_stats( + benchmark.metrics.output_token_count, + group="Output Tok", + name="Per Req", + ) + columns.add_stats( + benchmark.metrics.total_token_count, + group="Total Tok", + name="Per Req", + ) + columns.add_stats( + benchmark.metrics.request_streaming_iterations_count, + group="Stream Iter", + name="Per Req", + ) + columns.add_stats( + benchmark.metrics.output_tokens_per_iteration, + group="Output Tok", + name="Per Stream Iter", + ) + + headers, values = columns.get_table_data() + self.console.print("\n") + self.console.print_table( + headers, + values, + title="Request Token Statistics (Completed Requests)", + ) + + def print_request_latency_table(self, report: GenerativeBenchmarksReport): + """ + Print request latency metrics table. + + :param report: The benchmark report containing latency metrics + """ + columns = ConsoleTableColumnsCollection() + + for benchmark in report.benchmarks: + columns.add_value( + benchmark.config.strategy.type_, + group="Benchmark", + name="Strategy", + type_="text", + ) + columns.add_stats( + benchmark.metrics.request_latency, + group="Request Latency", + name="Sec", + ) + columns.add_stats( + benchmark.metrics.time_to_first_token_ms, + group="TTFT", + name="ms", + ) + columns.add_stats( + benchmark.metrics.inter_token_latency_ms, + group="ITL", + name="ms", + ) + columns.add_stats( + benchmark.metrics.time_per_output_token_ms, + group="TPOT", + name="ms", + ) + + headers, values = columns.get_table_data() + self.console.print("\n") + self.console.print_table( + headers, + values, + title="Request Latency Statistics (Completed Requests)", + ) + + def print_server_throughput_table(self, report: GenerativeBenchmarksReport): + """ + Print server throughput metrics table. 
+ + :param report: The benchmark report containing throughput metrics + """ + columns = ConsoleTableColumnsCollection() + + for benchmark in report.benchmarks: + columns.add_value( + benchmark.config.strategy.type_, + group="Benchmark", + name="Strategy", + type_="text", + ) + columns.add_stats( + benchmark.metrics.requests_per_second, + group="Requests", + name="Per Sec", + ) + columns.add_stats( + benchmark.metrics.request_concurrency, + group="Requests", + name="Concurrency", + ) + columns.add_stats( + benchmark.metrics.prompt_tokens_per_second, + group="Input Tokens", + name="Per Sec", + ) + columns.add_stats( + benchmark.metrics.output_tokens_per_second, + group="Output Tokens", + name="Per Sec", + ) + columns.add_stats( + benchmark.metrics.tokens_per_second, + group="Total Tokens", + name="Per Sec", + ) + + headers, values = columns.get_table_data() + self.console.print("\n") + self.console.print_table(headers, values, title="Server Throughput Statistics") + + def _print_modality_table( + self, + report: GenerativeBenchmarksReport, + modality: Literal["text", "image", "video", "audio"], + title: str, + metric_groups: list[tuple[str, str]], + ): + columns: dict[str, ConsoleTableColumnsCollection] = defaultdict( + ConsoleTableColumnsCollection + ) + + for benchmark in report.benchmarks: + columns["labels"].add_value( + benchmark.config.strategy.type_, + group="Benchmark", + name="Strategy", + type_="text", + ) + + modality_metrics = getattr(benchmark.metrics, modality) + + for metric_attr, display_name in metric_groups: + metric_obj = getattr(modality_metrics, metric_attr, None) + self._add_input_output_stats( + columns=columns, + metric_obj=metric_obj, + metric_key=metric_attr, + display_name=display_name, + ) + + self._print_inp_out_tables( + title=title, + labels=columns["labels"], + groups=[ + (columns[f"{metric_attr}.input"], columns[f"{metric_attr}.output"]) + for metric_attr, _ in metric_groups + ], + ) + + def _print_inp_out_tables( + self, + title: str, + labels: ConsoleTableColumnsCollection, + groups: list[ + tuple[ConsoleTableColumnsCollection, ConsoleTableColumnsCollection] + ], + ): + input_headers, input_values = [], [] + output_headers, output_values = [], [] + input_has_data = False + output_has_data = False + + for input_columns, output_columns in groups: + # Check if columns have any non-None values + type_input_has_data = any( + any(value is not None for value in column.values) + for column in input_columns.values() + ) + type_output_has_data = any( + any(value is not None for value in column.values) + for column in output_columns.values() + ) + + if not (type_input_has_data or type_output_has_data): + continue + + input_has_data = input_has_data or type_input_has_data + output_has_data = output_has_data or type_output_has_data + + input_type_headers, input_type_columns = input_columns.get_table_data() + output_type_headers, output_type_columns = output_columns.get_table_data() + + input_headers.extend(input_type_headers) + input_values.extend(input_type_columns) + output_headers.extend(output_type_headers) + output_values.extend(output_type_columns) + + if not (input_has_data or output_has_data): + return + + labels_headers, labels_values = labels.get_table_data() + header_cols_groups = [] + value_cols_groups = [] + + if input_has_data: + header_cols_groups.append(labels_headers + input_headers) + value_cols_groups.append(labels_values + input_values) + if output_has_data: + header_cols_groups.append(labels_headers + output_headers) + 
value_cols_groups.append(labels_values + output_values) + + if header_cols_groups and value_cols_groups: + self.console.print("\n") + self.console.print_tables( + header_cols_groups=header_cols_groups, + value_cols_groups=value_cols_groups, + title=title, + ) + + def _add_input_output_stats( + self, + columns: dict[str, ConsoleTableColumnsCollection], + metric_obj: Any, + metric_key: str, + display_name: str, + ): + input_stats: StatusDistributionSummary | None = ( + getattr(metric_obj, "input", None) if metric_obj else None + ) + input_per_second_stats: StatusDistributionSummary | None = ( + getattr(metric_obj, "input_per_second", None) if metric_obj else None + ) + output_stats: StatusDistributionSummary | None = ( + getattr(metric_obj, "output", None) if metric_obj else None + ) + output_per_second_stats: StatusDistributionSummary | None = ( + getattr(metric_obj, "output_per_second", None) if metric_obj else None + ) + + columns[f"{metric_key}.input"].add_stats( + input_stats, + group=f"Input {display_name}", + name="Per Request", + ) + columns[f"{metric_key}.input"].add_stats( + input_per_second_stats, + group=f"Input {display_name}", + name="Per Second", + ) + columns[f"{metric_key}.output"].add_stats( + output_stats, + group=f"Output {display_name}", + name="Per Request", + ) + columns[f"{metric_key}.output"].add_stats( + output_per_second_stats, + group=f"Output {display_name}", + name="Per Second", + ) diff --git a/src/guidellm/benchmark/outputs/csv.py b/src/guidellm/benchmark/outputs/csv.py new file mode 100644 index 00000000..c1ea2479 --- /dev/null +++ b/src/guidellm/benchmark/outputs/csv.py @@ -0,0 +1,692 @@ +""" +CSV output formatter for benchmark results. + +This module provides the GenerativeBenchmarkerCSV class which exports benchmark +reports to CSV format with comprehensive metrics including timing, throughput, +latency, modality data, and scheduler information. The CSV output uses multi-row +headers to organize metrics hierarchically and includes both summary statistics +and distribution percentiles. +""" + +from __future__ import annotations + +import csv +import json +from pathlib import Path +from typing import Annotated, Any, ClassVar, Literal + +from pydantic import Field + +from guidellm.benchmark.outputs.output import GenerativeBenchmarkerOutput +from guidellm.benchmark.schemas import GenerativeBenchmark, GenerativeBenchmarksReport +from guidellm.schemas import DistributionSummary, StatusDistributionSummary +from guidellm.utils import safe_format_timestamp + +__all__ = ["GenerativeBenchmarkerCSV"] + +TIMESTAMP_FORMAT: Annotated[str, "Format string for timestamp output in CSV files"] = ( + "%Y-%m-%d %H:%M:%S" +) +MODALITY_METRICS: Annotated[ + dict[str, list[tuple[str, str]]], + "Mapping of modality types to their metric names and display labels", +] = { + "text": [ + ("tokens", "Tokens"), + ("words", "Words"), + ("characters", "Characters"), + ], + "image": [ + ("tokens", "Tokens"), + ("images", "Images"), + ("pixels", "Pixels"), + ("bytes", "Bytes"), + ], + "video": [ + ("tokens", "Tokens"), + ("frames", "Frames"), + ("seconds", "Seconds"), + ("bytes", "Bytes"), + ], + "audio": [ + ("tokens", "Tokens"), + ("samples", "Samples"), + ("seconds", "Seconds"), + ("bytes", "Bytes"), + ], +} + + +@GenerativeBenchmarkerOutput.register("csv") +class GenerativeBenchmarkerCSV(GenerativeBenchmarkerOutput): + """ + CSV output formatter for benchmark results. 
+ + Exports comprehensive benchmark data to CSV format with multi-row headers + organizing metrics into categories including run information, timing, request + counts, latency, throughput, modality-specific data, and scheduler state. Each + benchmark run becomes a row with statistical distributions represented as + mean, median, standard deviation, and percentiles. + + :cvar DEFAULT_FILE: Default filename for CSV output + """ + + DEFAULT_FILE: ClassVar[str] = "benchmarks.csv" + + @classmethod + def validated_kwargs( + cls, output_path: str | Path | None, **_kwargs + ) -> dict[str, Any]: + """ + Validate and normalize constructor keyword arguments. + + :param output_path: Path for CSV output file or directory + :param _kwargs: Additional keyword arguments (ignored) + :return: Normalized keyword arguments dictionary + """ + new_kwargs = {} + if output_path is not None: + new_kwargs["output_path"] = ( + Path(output_path) if not isinstance(output_path, Path) else output_path + ) + return new_kwargs + + output_path: Path = Field( + default_factory=lambda: Path.cwd(), + description=( + "Path where the CSV file will be saved, defaults to current directory" + ), + ) + + async def finalize(self, report: GenerativeBenchmarksReport) -> Path: + """ + Save the benchmark report as a CSV file. + + :param report: The completed benchmark report + :return: Path to the saved CSV file + """ + output_path = self.output_path + if output_path.is_dir(): + output_path = output_path / GenerativeBenchmarkerCSV.DEFAULT_FILE + output_path.parent.mkdir(parents=True, exist_ok=True) + + with output_path.open("w", newline="") as file: + writer = csv.writer(file) + headers: list[list[str]] = [] + rows: list[list[str | int | float]] = [] + + for benchmark in report.benchmarks: + benchmark_headers: list[list[str]] = [] + benchmark_values: list[str | int | float] = [] + + self._add_run_info(benchmark, benchmark_headers, benchmark_values) + self._add_benchmark_info(benchmark, benchmark_headers, benchmark_values) + self._add_timing_info(benchmark, benchmark_headers, benchmark_values) + self._add_request_counts(benchmark, benchmark_headers, benchmark_values) + self._add_request_latency_metrics( + benchmark, benchmark_headers, benchmark_values + ) + self._add_server_throughput_metrics( + benchmark, benchmark_headers, benchmark_values + ) + for modality_name in ["text", "image", "video", "audio"]: + self._add_modality_metrics( + benchmark, + modality_name, # type: ignore[arg-type] + benchmark_headers, + benchmark_values, + ) + self._add_scheduler_info(benchmark, benchmark_headers, benchmark_values) + + if not headers: + headers = benchmark_headers + rows.append(benchmark_values) + + self._write_multirow_header(writer, headers) + for row in rows: + writer.writerow(row) + + return output_path + + def _write_multirow_header(self, writer: Any, headers: list[list[str]]) -> None: + """ + Write multi-row header to CSV for hierarchical metric organization. + + :param writer: CSV writer instance + :param headers: List of column header hierarchies as string lists + """ + max_rows = max((len(col) for col in headers), default=0) + for row_idx in range(max_rows): + row = [col[row_idx] if row_idx < len(col) else "" for col in headers] + writer.writerow(row) + + def _add_field( + self, + headers: list[list[str]], + values: list[str | int | float], + group: str, + field_name: str, + value: Any, + units: str = "", + ) -> None: + """ + Add a single field to headers and values lists. 
+ + :param headers: List of header hierarchies to append to + :param values: List of values to append to + :param group: Top-level category for the field + :param field_name: Name of the field + :param value: Value for the field + :param units: Optional units for the field + """ + headers.append([group, field_name, units]) + values.append(value) + + def _add_run_info( + self, + benchmark: GenerativeBenchmark, + headers: list[list[str]], + values: list[str | int | float], + ) -> None: + """ + Add overall run identification and configuration information. + + :param benchmark: Benchmark data to extract run info from + :param headers: List of header hierarchies to append to + :param values: List of values to append to + """ + self._add_field(headers, values, "Run Info", "Run ID", benchmark.config.run_id) + self._add_field( + headers, values, "Run Info", "Run Index", benchmark.config.run_index + ) + self._add_field( + headers, + values, + "Run Info", + "Profile", + benchmark.config.profile.model_dump_json(), + ) + self._add_field( + headers, + values, + "Run Info", + "Requests", + json.dumps(benchmark.config.requests), + ) + self._add_field( + headers, values, "Run Info", "Backend", json.dumps(benchmark.config.backend) + ) + self._add_field( + headers, + values, + "Run Info", + "Environment", + json.dumps(benchmark.config.environment), + ) + + def _add_benchmark_info( + self, + benchmark: GenerativeBenchmark, + headers: list[list[str]], + values: list[str | int | float], + ) -> None: + """ + Add individual benchmark configuration details. + + :param benchmark: Benchmark data to extract configuration from + :param headers: List of header hierarchies to append to + :param values: List of values to append to + """ + self._add_field(headers, values, "Benchmark", "Type", benchmark.type_) + self._add_field(headers, values, "Benchmark", "ID", benchmark.config.id_) + self._add_field( + headers, values, "Benchmark", "Strategy", benchmark.config.strategy.type_ + ) + self._add_field( + headers, + values, + "Benchmark", + "Constraints", + json.dumps(benchmark.config.constraints), + ) + + def _add_timing_info( + self, + benchmark: GenerativeBenchmark, + headers: list[list[str]], + values: list[str | int | float], + ) -> None: + """ + Add timing information including start, end, duration, warmup, and cooldown. 
+ + :param benchmark: Benchmark data to extract timing from + :param headers: List of header hierarchies to append to + :param values: List of values to append to + """ + timing_fields: list[tuple[str, Any]] = [ + ("Start Time", benchmark.scheduler_metrics.start_time), + ("Request Start Time", benchmark.scheduler_metrics.request_start_time), + ("Measure Start Time", benchmark.scheduler_metrics.measure_start_time), + ("Measure End Time", benchmark.scheduler_metrics.measure_end_time), + ("Request End Time", benchmark.scheduler_metrics.request_end_time), + ("End Time", benchmark.scheduler_metrics.end_time), + ] + for field_name, timestamp in timing_fields: + self._add_field( + headers, + values, + "Timings", + field_name, + safe_format_timestamp(timestamp, TIMESTAMP_FORMAT), + ) + + duration_fields: list[tuple[str, float]] = [ + ("Duration", benchmark.duration), + ("Warmup", benchmark.config.warmup or 0.0), + ("Cooldown", benchmark.config.cooldown or 0.0), + ] + for field_name, duration_value in duration_fields: + self._add_field( + headers, values, "Timings", field_name, duration_value, "Sec" + ) + + def _add_request_counts( + self, + benchmark: GenerativeBenchmark, + headers: list[list[str]], + values: list[str | int | float], + ) -> None: + """ + Add request count totals by status. + + :param benchmark: Benchmark data to extract request counts from + :param headers: List of header hierarchies to append to + :param values: List of values to append to + """ + for status in ["successful", "incomplete", "errored", "total"]: + self._add_field( + headers, + values, + "Request Counts", + status.capitalize(), + getattr(benchmark.metrics.request_totals, status), + ) + + def _add_request_latency_metrics( + self, + benchmark: GenerativeBenchmark, + headers: list[list[str]], + values: list[str | int | float], + ) -> None: + """ + Add request latency and streaming metrics. + + :param benchmark: Benchmark data to extract latency metrics from + :param headers: List of header hierarchies to append to + :param values: List of values to append to + """ + self._add_stats_for_metric( + headers, values, benchmark.metrics.request_latency, "Request Latency", "Sec" + ) + self._add_stats_for_metric( + headers, + values, + benchmark.metrics.request_streaming_iterations_count, + "Streaming Iterations", + "Count", + ) + self._add_stats_for_metric( + headers, + values, + benchmark.metrics.time_to_first_token_ms, + "Time to First Token", + "ms", + ) + self._add_stats_for_metric( + headers, + values, + benchmark.metrics.time_per_output_token_ms, + "Time per Output Token", + "ms", + ) + self._add_stats_for_metric( + headers, + values, + benchmark.metrics.inter_token_latency_ms, + "Inter Token Latency", + "ms", + ) + + def _add_server_throughput_metrics( + self, + benchmark: GenerativeBenchmark, + headers: list[list[str]], + values: list[str | int | float], + ) -> None: + """ + Add server throughput metrics including requests, tokens, and concurrency. 
+ + :param benchmark: Benchmark data to extract throughput metrics from + :param headers: List of header hierarchies to append to + :param values: List of values to append to + """ + self._add_stats_for_metric( + headers, + values, + benchmark.metrics.requests_per_second, + "Server Throughput", + "Requests/Sec", + ) + self._add_stats_for_metric( + headers, + values, + benchmark.metrics.request_concurrency, + "Server Throughput", + "Concurrency", + ) + self._add_stats_for_metric( + headers, + values, + benchmark.metrics.prompt_token_count, + "Token Metrics", + "Input Tokens", + ) + self._add_stats_for_metric( + headers, + values, + benchmark.metrics.output_token_count, + "Token Metrics", + "Output Tokens", + ) + self._add_stats_for_metric( + headers, + values, + benchmark.metrics.total_token_count, + "Token Metrics", + "Total Tokens", + ) + self._add_stats_for_metric( + headers, + values, + benchmark.metrics.prompt_tokens_per_second, + "Token Throughput", + "Input Tokens/Sec", + ) + self._add_stats_for_metric( + headers, + values, + benchmark.metrics.output_tokens_per_second, + "Token Throughput", + "Output Tokens/Sec", + ) + self._add_stats_for_metric( + headers, + values, + benchmark.metrics.tokens_per_second, + "Token Throughput", + "Total Tokens/Sec", + ) + self._add_stats_for_metric( + headers, + values, + benchmark.metrics.output_tokens_per_iteration, + "Token Streaming", + "Output Tokens/Iter", + ) + self._add_stats_for_metric( + headers, + values, + benchmark.metrics.iter_tokens_per_iteration, + "Token Streaming", + "Iter Tokens/Iter", + ) + + def _add_modality_metrics( + self, + benchmark: GenerativeBenchmark, + modality: Literal["text", "image", "video", "audio"], + headers: list[list[str]], + values: list[str | int | float], + ) -> None: + """ + Add modality-specific metrics for text, image, video, or audio data. + + :param benchmark: Benchmark data to extract modality metrics from + :param modality: Type of modality to extract metrics for + :param headers: List of header hierarchies to append to + :param values: List of values to append to + """ + modality_summary = getattr(benchmark.metrics, modality) + metric_definitions = MODALITY_METRICS[modality] + + for metric_name, display_name in metric_definitions: + metric_obj = getattr(modality_summary, metric_name, None) + if metric_obj is None: + continue + + for io_type in ["input", "output", "total"]: + dist_summary = getattr(metric_obj, io_type, None) + if dist_summary is None: + continue + + if not self._has_distribution_data(dist_summary): + continue + + self._add_stats_for_metric( + headers, + values, + dist_summary, + f"{modality.capitalize()} {display_name}", + io_type.capitalize(), + ) + + def _has_distribution_data(self, dist_summary: StatusDistributionSummary) -> bool: + """ + Check if distribution summary contains any data. + + :param dist_summary: Distribution summary to check + :return: True if summary contains data, False otherwise + """ + return any( + getattr(dist_summary, status, None) is not None + and getattr(dist_summary, status).total_sum > 0.0 + for status in ["successful", "incomplete", "errored"] + ) + + def _add_scheduler_info( + self, + benchmark: GenerativeBenchmark, + headers: list[list[str]], + values: list[str | int | float], + ) -> None: + """ + Add scheduler state and performance information. 
+ + :param benchmark: Benchmark data to extract scheduler info from + :param headers: List of header hierarchies to append to + :param values: List of values to append to + """ + self._add_scheduler_state(benchmark, headers, values) + self._add_scheduler_metrics(benchmark, headers, values) + + def _add_scheduler_state( + self, + benchmark: GenerativeBenchmark, + headers: list[list[str]], + values: list[str | int | float], + ) -> None: + """ + Add scheduler state information including request counts and timing. + + :param benchmark: Benchmark data to extract scheduler state from + :param headers: List of header hierarchies to append to + :param values: List of values to append to + """ + state = benchmark.scheduler_state + + state_fields: list[tuple[str, Any]] = [ + ("Node ID", state.node_id), + ("Num Processes", state.num_processes), + ("Created Requests", state.created_requests), + ("Processed Requests", state.processed_requests), + ("Successful Requests", state.successful_requests), + ("Errored Requests", state.errored_requests), + ("Cancelled Requests", state.cancelled_requests), + ] + + for field_name, value in state_fields: + self._add_field(headers, values, "Scheduler State", field_name, value) + + if state.end_queuing_time: + self._add_field( + headers, + values, + "Scheduler State", + "End Queuing Time", + safe_format_timestamp(state.end_queuing_time, TIMESTAMP_FORMAT), + ) + end_queuing_constraints_dict = { + key: constraint.model_dump() + for key, constraint in state.end_queuing_constraints.items() + } + self._add_field( + headers, + values, + "Scheduler State", + "End Queuing Constraints", + json.dumps(end_queuing_constraints_dict), + ) + + if state.end_processing_time: + self._add_field( + headers, + values, + "Scheduler State", + "End Processing Time", + safe_format_timestamp(state.end_processing_time, TIMESTAMP_FORMAT), + ) + end_processing_constraints_dict = { + key: constraint.model_dump() + for key, constraint in state.end_processing_constraints.items() + } + self._add_field( + headers, + values, + "Scheduler State", + "End Processing Constraints", + json.dumps(end_processing_constraints_dict), + ) + + def _add_scheduler_metrics( + self, + benchmark: GenerativeBenchmark, + headers: list[list[str]], + values: list[str | int | float], + ) -> None: + """ + Add scheduler performance metrics including delays and processing times. 
+ + :param benchmark: Benchmark data to extract scheduler metrics from + :param headers: List of header hierarchies to append to + :param values: List of values to append to + """ + metrics = benchmark.scheduler_metrics + + requests_made_fields: list[tuple[str, int]] = [ + ("Requests Made Successful", metrics.requests_made.successful), + ("Requests Made Incomplete", metrics.requests_made.incomplete), + ("Requests Made Errored", metrics.requests_made.errored), + ("Requests Made Total", metrics.requests_made.total), + ] + for field_name, value in requests_made_fields: + self._add_field(headers, values, "Scheduler Metrics", field_name, value) + + timing_metrics: list[tuple[str, float]] = [ + ("Queued Time Avg", metrics.queued_time_avg), + ("Resolve Start Delay Avg", metrics.resolve_start_delay_avg), + ( + "Resolve Targeted Start Delay Avg", + metrics.resolve_targeted_start_delay_avg, + ), + ("Request Start Delay Avg", metrics.request_start_delay_avg), + ( + "Request Targeted Start Delay Avg", + metrics.request_targeted_start_delay_avg, + ), + ("Request Time Avg", metrics.request_time_avg), + ("Resolve End Delay Avg", metrics.resolve_end_delay_avg), + ("Resolve Time Avg", metrics.resolve_time_avg), + ("Finalized Delay Avg", metrics.finalized_delay_avg), + ("Processed Delay Avg", metrics.processed_delay_avg), + ] + for field_name, timing in timing_metrics: + self._add_field( + headers, values, "Scheduler Metrics", field_name, timing, "Sec" + ) + + def _add_stats_for_metric( + self, + headers: list[list[str]], + values: list[str | int | float], + metric: StatusDistributionSummary | DistributionSummary, + group: str, + units: str, + ) -> None: + """ + Add statistical summaries for a metric across all statuses. + + :param headers: List of header hierarchies to append to + :param values: List of values to append to + :param metric: Distribution summary to extract statistics from + :param group: Top-level category for the metric + :param units: Units for the metric values + """ + if isinstance(metric, StatusDistributionSummary): + for status in ["successful", "incomplete", "errored"]: + dist = getattr(metric, status, None) + if dist is None or dist.total_sum == 0.0: + continue + self._add_distribution_stats( + headers, values, dist, group, units, status + ) + else: + self._add_distribution_stats(headers, values, metric, group, units, None) + + def _add_distribution_stats( + self, + headers: list[list[str]], + values: list[str | int | float], + dist: DistributionSummary, + group: str, + units: str, + status: str | None, + ) -> None: + """ + Add distribution statistics including mean, median, and percentiles. 
+ + :param headers: List of header hierarchies to append to + :param values: List of values to append to + :param dist: Distribution summary with statistical data + :param group: Top-level category for the metric + :param units: Units for the metric values + :param status: Request status (successful, incomplete, errored) or None + """ + status_prefix = f"{status.capitalize()} " if status else "" + + headers.append([group, f"{status_prefix}{units}", "Mean"]) + values.append(dist.mean) + + headers.append([group, f"{status_prefix}{units}", "Median"]) + values.append(dist.median) + + headers.append([group, f"{status_prefix}{units}", "Std Dev"]) + values.append(dist.std_dev) + + headers.append([group, f"{status_prefix}{units}", "Percentiles"]) + percentiles_str = ( + f"[{dist.min}, {dist.percentiles.p001}, {dist.percentiles.p01}, " + f"{dist.percentiles.p05}, {dist.percentiles.p10}, {dist.percentiles.p25}, " + f"{dist.percentiles.p75}, {dist.percentiles.p90}, {dist.percentiles.p95}, " + f"{dist.percentiles.p99}, {dist.max}]" + ) + values.append(percentiles_str) diff --git a/src/guidellm/benchmark/outputs/html.py b/src/guidellm/benchmark/outputs/html.py new file mode 100644 index 00000000..34cf7107 --- /dev/null +++ b/src/guidellm/benchmark/outputs/html.py @@ -0,0 +1,422 @@ +""" +HTML output formatter for benchmark results. + +Transforms benchmark data into interactive web-based reports by building UI data +structures, converting keys to camelCase for JavaScript compatibility, and injecting +formatted data into HTML templates. The formatter processes GenerativeBenchmark +instances and their associated metrics, creating histogram buckets for distributions, +formatting percentile statistics for tabular display, and embedding all data as +JavaScript objects within an HTML template for client-side rendering and visualization. +""" + +from __future__ import annotations + +import json +import random +import re +from collections import defaultdict +from copy import deepcopy +from math import ceil +from pathlib import Path +from typing import Any, ClassVar + +from loguru import logger +from pydantic import BaseModel, Field, computed_field + +from guidellm.benchmark.outputs.output import GenerativeBenchmarkerOutput +from guidellm.benchmark.schemas import ( + BenchmarkGenerativeTextArgs, + GenerativeBenchmark, + GenerativeBenchmarksReport, +) +from guidellm.schemas import DistributionSummary +from guidellm.settings import settings +from guidellm.utils import camelize_str, recursive_key_update +from guidellm.utils.text import load_text + +__all__ = ["GenerativeBenchmarkerHTML"] + + +@GenerativeBenchmarkerOutput.register("html") +class GenerativeBenchmarkerHTML(GenerativeBenchmarkerOutput): + """ + HTML output formatter for benchmark results. + + Generates interactive HTML reports from benchmark data by transforming results + into camelCase JSON structures and injecting them into HTML templates. The + formatter processes benchmark metrics, creates histogram distributions, and + embeds all data into a pre-built HTML template for browser-based visualization. + Reports are saved to the specified output path or current working directory. 
+ + :cvar DEFAULT_FILE: Default filename for HTML output when a directory is provided + """ + + DEFAULT_FILE: ClassVar[str] = "benchmarks.html" + + output_path: Path = Field( + default_factory=lambda: Path.cwd(), + description=( + "Directory or file path for saving the HTML report, " + "defaults to current working directory" + ), + ) + + @classmethod + def validated_kwargs( + cls, output_path: str | Path | None, **_kwargs + ) -> dict[str, Any]: + """ + Validate and normalize output path argument. + + :param output_path: Output file or directory path for the HTML report + :return: Dictionary containing validated output_path if provided + """ + validated: dict[str, Any] = {} + if output_path is not None: + validated["output_path"] = ( + Path(output_path) if not isinstance(output_path, Path) else output_path + ) + return validated + + async def finalize(self, report: GenerativeBenchmarksReport) -> Path: + """ + Generate and save the HTML benchmark report. + + Transforms benchmark data into camelCase JSON format, injects it into the + HTML template, and writes the resulting report to the output path. Creates + parent directories if they don't exist. + + :param report: Completed benchmark report containing all results + :return: Path to the saved HTML report file + """ + output_path = self.output_path + if output_path.is_dir(): + output_path = output_path / self.DEFAULT_FILE + output_path.parent.mkdir(parents=True, exist_ok=True) + + data = _build_ui_data(report.benchmarks, report.args) + camel_data = recursive_key_update(deepcopy(data), camelize_str) + + ui_api_data = { + f"window.{key} = {{}};": f"window.{key} = {json.dumps(value, indent=2)};\n" + for key, value in camel_data.items() + } + + _create_html_report(ui_api_data, output_path) + + return output_path + + +class _Bucket(BaseModel): + """ + Histogram bucket for data distribution visualization. + + Represents a single bucket in a histogram with its starting value and count + of data points falling within the bucket range. Used to create distribution + histograms for metrics like token counts and request timings. + """ + + value: float | int = Field(description="Starting value of the bucket range") + count: int = Field(description="Number of data points falling within this bucket") + + @staticmethod + def from_data( + data: list[float] | list[int], + bucket_width: float | None = None, + n_buckets: int | None = None, + ) -> tuple[list[_Bucket], float]: + """ + Create histogram buckets from numeric data values. + + Divides the data range into equal-width buckets and counts values within + each bucket. Either bucket_width or n_buckets can be specified; if neither + is provided, defaults to 10 buckets. 
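For intuition, the equal-width bucketing used by this formatter behaves roughly like the simplified standalone sketch below (not the _Bucket implementation that follows, which also returns the computed bucket width):

# Simplified equal-width histogram bucketing: split [min, max] into n buckets
# and count values per bucket start, clamping the top value into the last bucket.
from collections import defaultdict


def bucket_counts(data: list[float], n_buckets: int = 10) -> dict[float, int]:
    if not data:
        return {}
    lo, hi = min(data), max(data)
    width = ((hi + 1) - lo) / n_buckets  # +1 keeps the max value in-range
    counts: dict[float, int] = defaultdict(int)
    for value in data:
        idx = min(int((value - lo) // width), n_buckets - 1)
        counts[lo + idx * width] += 1
    return dict(sorted(counts.items()))


print(bucket_counts([1, 2, 2, 3, 10, 11], n_buckets=3))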
+ + :param data: Numeric values to bucket + :param bucket_width: Width of each bucket, computed if None + :param n_buckets: Number of buckets, defaults to 10 if width not specified + :return: Tuple of bucket list and computed bucket width + """ + if not data: + return [], 1.0 + + min_v = min(data) + max_v = max(data) + range_v = (1 + max_v) - min_v + + if bucket_width is None: + if n_buckets is None: + n_buckets = 10 + bucket_width = range_v / n_buckets + else: + n_buckets = ceil(range_v / bucket_width) + + bucket_counts: defaultdict[float | int, int] = defaultdict(int) + for val in data: + idx = int((val - min_v) // bucket_width) + if idx >= n_buckets: + idx = n_buckets - 1 + bucket_start = min_v + idx * bucket_width + bucket_counts[bucket_start] += 1 + + buckets = [ + _Bucket(value=start, count=count) + for start, count in sorted(bucket_counts.items()) + ] + return buckets, bucket_width + + +class _TabularDistributionSummary(DistributionSummary): + """ + Distribution summary with tabular percentile representation. + + Extends DistributionSummary to provide percentile data formatted for table + display in the HTML report. Filters to show only key percentiles (p50, p90, + p95, p99) for concise presentation. + """ + + @computed_field + def percentile_rows(self) -> list[dict[str, str | float]]: + """ + Format percentiles as table rows for UI display. + + :return: List of dictionaries with percentile names and values + """ + rows = [ + {"percentile": name, "value": value} + for name, value in self.percentiles.model_dump().items() + ] + return list( + filter(lambda row: row["percentile"] in ["p50", "p90", "p95", "p99"], rows) + ) + + @classmethod + def from_distribution_summary( + cls, distribution: DistributionSummary + ) -> _TabularDistributionSummary: + """ + Convert standard DistributionSummary to tabular format. + + :param distribution: Source distribution summary to convert + :return: Tabular distribution summary with formatted percentile rows + """ + return cls(**distribution.model_dump()) + + +def _create_html_report(js_data: dict[str, str], output_path: Path) -> Path: + """ + Create HTML report by injecting JavaScript data into template. + + Loads the HTML template, injects JavaScript data into the head section, and + writes the final report to the specified output path. + + :param js_data: Dictionary mapping placeholder strings to JavaScript code + :param output_path: Path where HTML report will be saved + :return: Path to the saved report file + """ + html_content = load_text(settings.report_generation.source) + report_content = _inject_data(js_data, html_content) + + output_path.parent.mkdir(parents=True, exist_ok=True) + output_path.write_text(report_content) + return output_path + + +def _inject_data(js_data: dict[str, str], html: str) -> str: + """ + Inject JavaScript data into HTML head section. + + Replaces placeholder strings in the HTML head section with actual JavaScript + code containing benchmark data. Returns original HTML if no head section found. + + :param js_data: Dictionary mapping placeholder strings to JavaScript code + :param html: HTML template content + :return: HTML with injected JavaScript data + """ + head_match = re.search(r"
]*>(.*?)", html, re.DOTALL | re.IGNORECASE) + if not head_match: + logger.warning(" section missing, returning original HTML.") + return html + + head_content = head_match.group(1) + + for placeholder, script in js_data.items(): + head_content = head_content.replace(placeholder, script) + + new_head = f"{head_content}" + return html[: head_match.start()] + new_head + html[head_match.end() :] + + +def _build_ui_data( + benchmarks: list[GenerativeBenchmark], args: BenchmarkGenerativeTextArgs +) -> dict[str, Any]: + """ + Build complete UI data structure from benchmarks. + + Aggregates benchmark results into a structured format for the HTML UI, + including run metadata, workload details, and per-benchmark metrics. + + :param benchmarks: List of completed benchmark results + :param args: Benchmark configuration arguments + :return: Dictionary with run_info, workload_details, and benchmarks sections + """ + return { + "run_info": _build_run_info(benchmarks, args), + "workload_details": _build_workload_details(benchmarks, args), + "benchmarks": _build_benchmarks(benchmarks), + } + + +def _build_run_info( + benchmarks: list[GenerativeBenchmark], args: BenchmarkGenerativeTextArgs +) -> dict[str, Any]: + """ + Build run metadata from benchmarks. + + Extracts model name, timestamp, and dataset information from the benchmark + configuration and results. + + :param benchmarks: List of completed benchmark results + :param args: Benchmark configuration arguments + :return: Dictionary with model, task, timestamp, and dataset information + """ + model = args.model or "N/A" + timestamp = max(bm.start_time for bm in benchmarks if bm.start_time is not None) + return { + "model": {"name": model, "size": 0}, + "task": "N/A", + "timestamp": timestamp, + "dataset": {"name": "N/A"}, + } + + +def _build_workload_details( + benchmarks: list[GenerativeBenchmark], args: BenchmarkGenerativeTextArgs +) -> dict[str, Any]: + """ + Build workload details from benchmarks. + + Aggregates prompt and generation samples, token distribution statistics, + request timing histograms, and server configuration. Samples up to 5 random + prompts and outputs for display. 
+ + :param benchmarks: List of completed benchmark results + :param args: Benchmark configuration arguments + :return: Dictionary with prompts, generations, request timing, and server info + """ + target = args.target + rate_type = benchmarks[0].config.strategy.type_ + successful_requests = [req for bm in benchmarks for req in bm.requests.successful] + + sample_indices = random.sample( + range(len(successful_requests)), min(5, len(successful_requests)) + ) + sample_prompts = [ + req.request_args.replace("\n", " ").replace('"', "'") + if (req := successful_requests[i]).request_args + else "" + for i in sample_indices + ] + sample_outputs = [ + req.output.replace("\n", " ").replace('"', "'") + if (req := successful_requests[i]).output + else "" + for i in sample_indices + ] + + prompt_tokens = [ + float(req.prompt_tokens) if req.prompt_tokens is not None else -1 + for bm in benchmarks + for req in bm.requests.successful + ] + output_tokens = [ + float(req.output_tokens) if req.output_tokens is not None else -1 + for bm in benchmarks + for req in bm.requests.successful + ] + + prompt_token_buckets, _prompt_bucket_width = _Bucket.from_data(prompt_tokens, 1) + output_token_buckets, _output_bucket_width = _Bucket.from_data(output_tokens, 1) + + prompt_token_stats = DistributionSummary.from_values(prompt_tokens) + output_token_stats = DistributionSummary.from_values(output_tokens) + + min_start_time = benchmarks[0].start_time + all_req_times = [ + req.info.timings.request_start - min_start_time + for bm in benchmarks + for req in bm.requests.successful + if req.info.timings.request_start is not None + ] + + number_of_buckets = len(benchmarks) + request_buckets, bucket_width = _Bucket.from_data( + all_req_times, None, number_of_buckets + ) + + return { + "prompts": { + "samples": sample_prompts, + "token_distributions": { + "statistics": prompt_token_stats.model_dump() + if prompt_token_stats + else None, + "buckets": [b.model_dump() for b in prompt_token_buckets], + "bucket_width": 1, + }, + }, + "generations": { + "samples": sample_outputs, + "token_distributions": { + "statistics": output_token_stats.model_dump() + if output_token_stats + else None, + "buckets": [b.model_dump() for b in output_token_buckets], + "bucket_width": 1, + }, + }, + "requests_over_time": { + "requests_over_time": { + "buckets": [b.model_dump() for b in request_buckets], + "bucket_width": bucket_width, + }, + "num_benchmarks": number_of_buckets, + }, + "rate_type": rate_type, + "server": {"target": target}, + } + + +def _build_benchmarks(benchmarks: list[GenerativeBenchmark]) -> list[dict[str, Any]]: + """ + Build benchmark metrics data for UI display. + + Extracts key performance metrics from each benchmark including requests per + second, inter-token latency, time to first token, throughput, and request + latency. Formats distribution summaries for tabular display. 
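The "tabular display" mentioned here amounts to keeping a handful of percentiles per metric, as _TabularDistributionSummary.percentile_rows does above; a rough standalone sketch of that filtering (percentile names assumed to match DistributionSummary's fields):

# Sketch: reduce a full percentile mapping to the rows the HTML tables show.
percentiles = {"p001": 0.8, "p25": 1.1, "p50": 1.4, "p90": 2.2, "p95": 2.6, "p99": 3.9}
table_rows = [
    {"percentile": name, "value": value}
    for name, value in percentiles.items()
    if name in {"p50", "p90", "p95", "p99"}
]
print(table_rows)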
+ + :param benchmarks: List of completed benchmark results + :return: List of dictionaries with formatted benchmark metrics + """ + result = [] + for bm in benchmarks: + result.append( + { + "requests_per_second": bm.metrics.requests_per_second.successful.mean, + "itl": _TabularDistributionSummary.from_distribution_summary( + bm.metrics.inter_token_latency_ms.successful + ).model_dump(), + "ttft": _TabularDistributionSummary.from_distribution_summary( + bm.metrics.time_to_first_token_ms.successful + ).model_dump(), + "throughput": _TabularDistributionSummary.from_distribution_summary( + bm.metrics.output_tokens_per_second.successful + ).model_dump(), + "time_per_request": ( + _TabularDistributionSummary.from_distribution_summary( + bm.metrics.request_latency.successful + ).model_dump() + ), + } + ) + return result diff --git a/src/guidellm/benchmark/outputs/output.py b/src/guidellm/benchmark/outputs/output.py new file mode 100644 index 00000000..8eb021b0 --- /dev/null +++ b/src/guidellm/benchmark/outputs/output.py @@ -0,0 +1,158 @@ +""" +Base output interface for generative benchmarking results. + +This module defines the abstract base class for all benchmark output formatters in +the guidellm system. Output formatters transform benchmark reports into various file +formats (JSON, CSV, HTML, etc.) enabling flexible result persistence and analysis. +The module leverages a registry pattern for dynamic format resolution and supports +both direct instantiation and configuration-based initialization. +""" + +from __future__ import annotations + +from abc import ABC, abstractmethod +from collections.abc import Mapping, Sequence +from pathlib import Path +from typing import Any + +from pydantic import BaseModel, ConfigDict + +from guidellm.benchmark.schemas import GenerativeBenchmarksReport +from guidellm.utils import RegistryMixin + +__all__ = ["GenerativeBenchmarkerOutput"] + + +class GenerativeBenchmarkerOutput( + BaseModel, RegistryMixin[type["GenerativeBenchmarkerOutput"]], ABC +): + """ + Abstract base for benchmark output formatters with registry support. + + Defines the interface for transforming benchmark reports into various output + formats. Subclasses implement specific formatters (JSON, CSV, HTML) that can be + registered and resolved dynamically. Supports flexible initialization from string + identifiers, file paths, or configuration dictionaries enabling declarative + output configuration in benchmark runs. + + Example: + :: + # Register and resolve output formats + outputs = GenerativeBenchmarkerOutput.resolve( + output_formats=["json", "csv"], + output_path="./results" + ) + + # Finalize outputs with benchmark report + for output in outputs.values(): + await output.finalize(report) + """ + + model_config = ConfigDict( + extra="ignore", + arbitrary_types_allowed=True, + validate_assignment=True, + from_attributes=True, + use_enum_values=True, + ) + + @classmethod + @abstractmethod + def validated_kwargs(cls, *args, **kwargs) -> dict[str, Any]: + """ + Validate and normalize initialization arguments for output formatter. + + Processes positional and keyword arguments into a validated parameter + dictionary suitable for formatter instantiation. Subclasses implement + format-specific validation logic handling their unique parameter patterns. 
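To make the contract concrete, here is a hypothetical plain-text formatter; the "txt" name, default filename, and summary line are invented for illustration, and only the register/validated_kwargs/finalize shape comes from the code in this diff:

# Hypothetical subclass illustrating the formatter contract; not part of guidellm.
from __future__ import annotations

from pathlib import Path
from typing import Any

from pydantic import Field

from guidellm.benchmark.outputs.output import GenerativeBenchmarkerOutput
from guidellm.benchmark.schemas import GenerativeBenchmarksReport


@GenerativeBenchmarkerOutput.register("txt")
class GenerativeBenchmarkerText(GenerativeBenchmarkerOutput):
    output_path: Path = Field(default_factory=lambda: Path.cwd())

    @classmethod
    def validated_kwargs(
        cls, output_path: str | Path | None = None, **_kwargs
    ) -> dict[str, Any]:
        return {} if output_path is None else {"output_path": Path(output_path)}

    async def finalize(self, report: GenerativeBenchmarksReport) -> Path:
        path = self.output_path
        if path.is_dir():
            path = path / "benchmarks.txt"
        path.write_text(f"completed benchmarks: {len(report.benchmarks)}\n")
        return path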
+ + :param args: Positional arguments for formatter configuration + :param kwargs: Keyword arguments for formatter configuration + :return: Validated dictionary of parameters for formatter creation + :raises NotImplementedError: Must be implemented by subclasses + """ + ... + + @classmethod + def resolve( + cls, + output_formats: ( + Sequence[str] + | Mapping[str, Any | dict[str, Any] | GenerativeBenchmarkerOutput] + | None + ), + output_path: str | Path | None, + ) -> dict[str, GenerativeBenchmarkerOutput]: + """ + Resolve output format specifications into formatter instances. + + Supports multiple input patterns: format identifiers (["json", "csv"]), + file paths (["results.json"]), format configurations ({"json": {"indent": 2}}), + or pre-instantiated formatters. Registered format types are resolved from the + registry and instantiated with validated parameters. + + :param output_formats: Format specifications as sequence of identifiers/paths, + mapping of format configurations, or None for no outputs + :param output_path: Default output directory path for all formatters + :return: Dictionary mapping format keys to instantiated formatter instances + :raises TypeError: If format specification type is invalid + :raises ValueError: If format resolution or validation fails + """ + resolved: dict[str, GenerativeBenchmarkerOutput] = {} + + if not output_formats: + return resolved + + if isinstance(output_formats, list | tuple): + # convert to dict for uniform processing + formats_list = output_formats + output_formats = {} + for output_format in formats_list: + # Check for registered type, if not, then assume it's a file path + if cls.is_registered(output_format): + output_formats[output_format] = {} + else: + path = Path(output_format) + format_type = path.suffix[1:].lower() + output_formats[format_type] = {"output_path": path} + + for key, val in output_formats.items(): # type: ignore[union-attr] + if isinstance(val, GenerativeBenchmarkerOutput): + resolved[key] = val + else: + output_class = cls.get_registered_object(key) + if output_class is None: + available_formats = ( + list(cls.registry.keys()) if cls.registry else [] + ) + raise ValueError( + f"Output format '{key}' is not registered. " + f"Available formats: {available_formats}" + ) + + kwargs: dict[str, Any] = {"output_path": output_path} + + if isinstance(val, dict): + kwargs.update(val) + kwargs = output_class.validated_kwargs(**kwargs) + else: + kwargs = output_class.validated_kwargs(val, **kwargs) + + resolved[key] = output_class(**kwargs) + + return resolved + + @abstractmethod + async def finalize(self, report: GenerativeBenchmarksReport) -> Any: + """ + Process and persist benchmark report in the formatter's output format. + + Transforms the provided benchmark report into the target format and writes + results to the configured output destination. Implementation details vary by + formatter type (file writing, API calls, etc.). + + :param report: Benchmark report containing results to format and output + :return: Format-specific output result (file path, response object, etc.) + :raises NotImplementedError: Must be implemented by subclasses + """ + ... diff --git a/src/guidellm/benchmark/outputs/serialized.py b/src/guidellm/benchmark/outputs/serialized.py new file mode 100644 index 00000000..52dc632a --- /dev/null +++ b/src/guidellm/benchmark/outputs/serialized.py @@ -0,0 +1,69 @@ +""" +Serialized output handler for generative benchmark reports. 
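Based on the resolution logic above, mixing a registered format name with a bare file path behaves roughly as follows (a usage sketch; it assumes the csv and json formatter modules have been imported so their registrations are in place):

# Usage sketch for GenerativeBenchmarkerOutput.resolve.
from guidellm.benchmark.outputs.output import GenerativeBenchmarkerOutput

outputs = GenerativeBenchmarkerOutput.resolve(
    output_formats=["csv", "results.json"],
    output_path="./benchmark-results",
)
# "csv" resolves to the registered CSV formatter using the default directory;
# "results.json" resolves to the "json" formatter with output_path overridden.
for name, formatter in outputs.items():
    print(name, type(formatter).__name__)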
+ +This module provides a serialized output implementation that saves benchmark reports +to JSON or YAML file formats. It extends the base GenerativeBenchmarkerOutput to +handle file-based persistence of benchmark results, supporting both directory and +explicit file path specifications for report serialization. +""" + +from __future__ import annotations + +from pathlib import Path +from typing import Any + +from pydantic import Field + +from guidellm.benchmark.outputs.output import GenerativeBenchmarkerOutput +from guidellm.benchmark.schemas import GenerativeBenchmarksReport + +__all__ = ["GenerativeBenchmarkerSerialized"] + + +@GenerativeBenchmarkerOutput.register(["json", "yaml"]) +class GenerativeBenchmarkerSerialized(GenerativeBenchmarkerOutput): + """ + Serialized output handler for benchmark reports in JSON or YAML formats. + + This output handler persists generative benchmark reports to the file system in + either JSON or YAML format. It supports flexible path specification, allowing + users to provide either a directory (where a default filename will be generated) + or an explicit file path for the serialized report output. + + Example: + :: + output = GenerativeBenchmarkerSerialized(output_path="/path/to/output.json") + result_path = await output.finalize(report) + """ + + output_path: Path = Field( + default_factory=lambda: Path.cwd(), + description="Directory or file path for saving the serialized report", + ) + + @classmethod + def validated_kwargs( + cls, output_path: str | Path | None, **_kwargs + ) -> dict[str, Any]: + """ + Validate and normalize output path keyword arguments. + + :param output_path: Directory or file path for serialization output + :param _kwargs: Additional keyword arguments (ignored) + :return: Dictionary of validated keyword arguments for class initialization + """ + validated: dict[str, Any] = {} + if output_path is not None: + validated["output_path"] = ( + Path(output_path) if not isinstance(output_path, Path) else output_path + ) + return validated + + async def finalize(self, report: GenerativeBenchmarksReport) -> Path: + """ + Serialize and save the benchmark report to the configured output path. 
+ + :param report: The generative benchmarks report to serialize + :return: Path to the saved report file + """ + return report.save_file(self.output_path) diff --git a/src/guidellm/benchmark/profile.py b/src/guidellm/benchmark/profile.py index 4b3f36fd..dc0a30d0 100644 --- a/src/guidellm/benchmark/profile.py +++ b/src/guidellm/benchmark/profile.py @@ -33,11 +33,10 @@ ConstraintInitializer, ConstraintsInitializerFactory, SchedulingStrategy, - StrategyType, SynchronousStrategy, ThroughputStrategy, ) -from guidellm.utils import PydanticClassRegistryMixin +from guidellm.schemas import PydanticClassRegistryMixin if TYPE_CHECKING: from guidellm.benchmark.schemas import Benchmark @@ -56,7 +55,7 @@ class Profile( - PydanticClassRegistryMixin["type[Profile]"], + PydanticClassRegistryMixin["Profile"], ABC, ): """ @@ -74,6 +73,7 @@ class Profile( @classmethod def __pydantic_schema_base_type__(cls) -> type[Profile]: + """Return the base type for polymorphic validation hierarchy.""" if cls.__name__ == "Profile": return cls @@ -97,7 +97,10 @@ def create( :return: Configured profile instance for the specified type :raises ValueError: If rate_type is not registered """ - profile_class: type[Profile] = cls.get_registered_object(rate_type) + profile_class = cls.get_registered_object(rate_type) + if profile_class is None: + raise ValueError(f"Profile type '{rate_type}' is not registered") + resolved_kwargs = profile_class.resolve_args( rate_type=rate_type, rate=rate, random_seed=random_seed, **kwargs ) @@ -138,7 +141,7 @@ def resolve_args( @computed_field # type: ignore[misc] @property - def strategy_types(self) -> list[StrategyType]: + def strategy_types(self) -> list[str]: """ :return: Strategy types executed or expected to execute in this profile """ @@ -147,10 +150,7 @@ def strategy_types(self) -> list[StrategyType]: def strategies_generator( self, ) -> Generator[ - tuple[ - SchedulingStrategy | None, - dict[str, Any | dict[str, Any] | Constraint] | None, - ], + tuple[SchedulingStrategy, dict[str, Constraint] | None], Benchmark | None, None, ]: @@ -196,7 +196,7 @@ def next_strategy_constraints( next_strategy: SchedulingStrategy | None, prev_strategy: SchedulingStrategy | None, prev_benchmark: Benchmark | None, - ) -> dict[str, Any | dict[str, Any] | Constraint] | None: + ) -> dict[str, Constraint] | None: """ Generate constraints for the next strategy execution. 
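The stricter lookup added to Profile.create above turns an unknown rate type into a clear error instead of a later failure on a None class; a usage sketch (argument names taken from the snippet above, concrete registered types depend on the profiles defined in this module):

# Usage sketch for the stricter Profile.create registry lookup.
from guidellm.benchmark.profile import Profile

try:
    Profile.create("not-a-registered-type", rate=None, random_seed=42)
except ValueError as err:
    print(err)  # Profile type 'not-a-registered-type' is not registered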
@@ -225,14 +225,16 @@ def _constraints_validator( return { key: ( - val - if not isinstance(val, ConstraintInitializer) - else ConstraintsInitializerFactory.deserialize(initializer_dict=val) + ConstraintsInitializerFactory.deserialize(initializer_dict=val) + if isinstance(val, dict) + and "type_" in val + and not isinstance(val, ConstraintInitializer) + else val ) for key, val in value.items() } - @field_serializer + @field_serializer("constraints") def _constraints_serializer( self, constraints: dict[str, Any | dict[str, Any] | ConstraintInitializer] | None, @@ -281,7 +283,7 @@ def resolve_args( return kwargs @property - def strategy_types(self) -> list[StrategyType]: + def strategy_types(self) -> list[str]: """ :return: Single synchronous strategy type """ @@ -346,7 +348,7 @@ def resolve_args( return kwargs @property - def strategy_types(self) -> list[StrategyType]: + def strategy_types(self) -> list[str]: """ :return: Concurrent strategy types for each configured stream count """ @@ -419,7 +421,7 @@ def resolve_args( return kwargs @property - def strategy_types(self) -> list[StrategyType]: + def strategy_types(self) -> list[str]: """ :return: Single throughput strategy type """ @@ -510,7 +512,7 @@ def resolve_args( return kwargs @property - def strategy_types(self) -> list[StrategyType]: + def strategy_types(self) -> list[str]: """ :return: Async strategy types for each configured rate """ @@ -622,7 +624,7 @@ def resolve_args( return kwargs @property - def strategy_types(self) -> list[StrategyType]: + def strategy_types(self) -> list[str]: """ :return: Strategy types for the complete sweep sequence """ @@ -637,8 +639,8 @@ def next_strategy( ) -> ( AsyncConstantStrategy | AsyncPoissonStrategy - | SynchronousProfile - | ThroughputProfile + | SynchronousStrategy + | ThroughputStrategy | None ): """ @@ -656,9 +658,7 @@ def next_strategy( return SynchronousStrategy() if prev_strategy.type_ == "synchronous": - self.synchronous_rate = prev_benchmark.get_request_metrics_sample()[ - "request_throughput" - ] + self.synchronous_rate = prev_benchmark.request_throughput.successful.mean return ThroughputStrategy( max_concurrency=self.max_concurrency, @@ -666,9 +666,7 @@ def next_strategy( ) if prev_strategy.type_ == "throughput": - self.throughput_rate = prev_benchmark.get_request_metrics_sample()[ - "request_throughput" - ] + self.throughput_rate = prev_benchmark.request_throughput.successful.mean if self.synchronous_rate <= 0 and self.throughput_rate <= 0: raise RuntimeError( "Invalid rates in sweep; aborting. " diff --git a/src/guidellm/benchmark/progress.py b/src/guidellm/benchmark/progress.py index 558def67..2e6c3274 100644 --- a/src/guidellm/benchmark/progress.py +++ b/src/guidellm/benchmark/progress.py @@ -1,17 +1,11 @@ """ -Benchmark progress tracking and console display abstractions. +Progress tracking and console display for benchmark execution monitoring. -Provides progress tracking interfaces and implementations for monitoring benchmark -execution, displaying real-time statistics, and managing UI updates during -generative benchmarking operations. - -Classes: - BenchmarkerProgress: Abstract base for benchmark progress tracking. - BenchmarkerProgressGroup: Composite progress handler for multiple instances. - GenerativeConsoleBenchmarkerProgress: Console-based progress display. - -Type Variables: - BenchmarkT: Generic benchmark object type. +Provides abstract interfaces and concrete implementations for tracking benchmark +progress during execution. 
The module enables real-time display of benchmark +statistics, metrics, and execution state through console-based UI components. +Primary use cases include monitoring generative benchmark runs with detailed +request/token statistics and scheduler state updates. """ from __future__ import annotations @@ -37,93 +31,92 @@ from guidellm.benchmark.profile import Profile from guidellm.benchmark.schemas import ( + BenchmarkAccumulatorT, BenchmarkT, - EstimatedBenchmarkState, GenerativeBenchmark, + GenerativeBenchmarkAccumulator, ) -from guidellm.scheduler import SchedulerState, SchedulingStrategy, StrategyType +from guidellm.scheduler import SchedulerState, SchedulingStrategy from guidellm.utils import Colors, format_value_display __all__ = ["BenchmarkerProgress", "GenerativeConsoleBenchmarkerProgress"] -class BenchmarkerProgress(Generic[BenchmarkT], ABC): +class BenchmarkerProgress(Generic[BenchmarkAccumulatorT, BenchmarkT], ABC): """ - Abstract base class for tracking and displaying benchmark progress. + Abstract interface for tracking and displaying benchmark execution progress. - Provides lifecycle hooks for monitoring benchmark execution stages including - initialization, start, updates, completion, and finalization. Supports - enable/disable functionality for conditional progress tracking. + Provides lifecycle hooks for monitoring benchmark stages including initialization, + execution start, progress updates, completion, and finalization. Implementations + handle display updates, progress tracking, and resource management for benchmark + monitoring. """ def __init__(self): - """ - Initialize progress tracker. - - :param enabled: Whether to enable progress tracking and display. - """ - self.profile: Profile = None - self.current_strategy: SchedulingStrategy = None + """Initialize progress tracker with default state.""" + self.profile: Profile | None = None + self.current_strategy: SchedulingStrategy | None = None @abstractmethod async def on_initialize(self, profile: Profile): """ - Initialize progress tracking for benchmark profile. + Initialize progress tracking for the given benchmark profile. - :param profile: Benchmark profile configuration. + :param profile: Benchmark profile configuration defining execution parameters """ @abstractmethod async def on_benchmark_start(self, strategy: SchedulingStrategy): """ - Handle start of new benchmark strategy execution. + Handle benchmark strategy execution start event. - :param strategy: Scheduling strategy being executed. + :param strategy: Scheduling strategy configuration being executed """ @abstractmethod async def on_benchmark_update( - self, estimated_state: EstimatedBenchmarkState, scheduler_state: SchedulerState + self, accumulator: BenchmarkAccumulatorT, scheduler_state: SchedulerState ): """ - Handle benchmark execution progress update. + Handle benchmark execution progress update with current metrics. - :param estimated_state: Current benchmark metrics and statistics. - :param scheduler_state: Current scheduler execution state. + :param accumulator: Current accumulated benchmark metrics and statistics + :param scheduler_state: Current scheduler execution state and counters """ @abstractmethod async def on_benchmark_complete(self, benchmark: BenchmarkT): """ - Handle completion of benchmark strategy execution. + Handle benchmark strategy execution completion event. - :param benchmark: Completed benchmark results. 
+ :param benchmark: Completed benchmark results with final metrics """ @abstractmethod async def on_finalize(self): - """Finalize progress tracking and cleanup resources.""" + """Finalize progress tracking and release associated resources.""" class GenerativeConsoleBenchmarkerProgress( - BenchmarkerProgress[GenerativeBenchmark], Live + BenchmarkerProgress[GenerativeBenchmarkAccumulator, GenerativeBenchmark], Live ): """ - Console-based progress display for generative benchmarks. + Console-based real-time progress display for generative benchmarks. - Provides real-time visual progress tracking using Rich library components, - displaying benchmark execution statistics, timing information, and progress - bars in a structured console interface. + Renders live benchmark execution statistics using Rich library components with + structured progress bars, timing information, request/token metrics, and optional + scheduler statistics. Updates refresh automatically during benchmark execution. + + :cvar display_scheduler_stats: Whether to include scheduler statistics in display """ def __init__(self, display_scheduler_stats: bool = False): """ - Initialize console progress display. + Initialize console progress display with rendering configuration. - :param enabled: Whether to enable progress tracking and display. - :param display_scheduler_stats: Whether to display scheduler statistics. + :param display_scheduler_stats: Whether to display scheduler timing statistics """ - BenchmarkerProgress.__init__(self) + super().__init__() Live.__init__( self, refresh_per_second=4, @@ -132,15 +125,15 @@ def __init__(self, display_scheduler_stats: bool = False): redirect_stderr=True, ) self.display_scheduler_stats: bool = display_scheduler_stats - self.run_progress: Progress = None - self.run_progress_task: TaskID = None - self.tasks_progress: _GenerativeProgressTasks = None + self.run_progress: Progress | None = None + self.run_progress_task: TaskID | None = None + self.tasks_progress: _GenerativeProgressTasks | None = None async def on_initialize(self, profile: Profile): """ - Initialize console display components and start rendering. + Initialize console display components and begin live rendering. - :param profile: Benchmark profile configuration. + :param profile: Benchmark profile configuration defining execution parameters """ self.tasks_progress = _GenerativeProgressTasks( profile=profile, display_scheduler_stats=self.display_scheduler_stats @@ -179,41 +172,46 @@ async def on_initialize(self, profile: Profile): async def on_benchmark_start(self, strategy: SchedulingStrategy): """ - Update display for new benchmark strategy start. + Update display for benchmark strategy execution start. - :param strategy: Scheduling strategy being executed. + :param strategy: Scheduling strategy configuration being executed """ - self.tasks_progress.start_benchmark(strategy) - self._sync_run_progress() + if self.tasks_progress is not None: + self.tasks_progress.start_benchmark(strategy) + self._sync_run_progress() async def on_benchmark_update( self, - aggregator_update: EstimatedBenchmarkState | None, + accumulator: GenerativeBenchmarkAccumulator, scheduler_state: SchedulerState, ): """ - Update display with current benchmark progress. + Update display with current benchmark progress and metrics. - :param aggregator_update: Current benchmark metrics and statistics. - :param scheduler_state: Current scheduler execution state. 
+ :param accumulator: Current accumulated benchmark metrics and statistics + :param scheduler_state: Current scheduler execution state and counters """ - self.tasks_progress.update_benchmark(aggregator_update, scheduler_state) - self._sync_run_progress() + if self.tasks_progress is not None: + self.tasks_progress.update_benchmark(accumulator, scheduler_state) + self._sync_run_progress() async def on_benchmark_complete(self, benchmark: GenerativeBenchmark): """ - Update display for completed benchmark. + Update display for completed benchmark strategy. - :param benchmark: Completed benchmark results. + :param benchmark: Completed benchmark results with final metrics """ - self.tasks_progress.complete_benchmark(benchmark) - self._sync_run_progress() + if self.tasks_progress is not None: + self.tasks_progress.complete_benchmark(benchmark) + self._sync_run_progress() async def on_finalize(self): - """Stop display rendering and cleanup resources.""" - self.tasks_progress.finalize() - self._sync_run_progress() - self.run_progress.stop_task(self.run_progress_task) + """Stop display rendering and release resources.""" + if self.tasks_progress is not None: + self.tasks_progress.finalize() + self._sync_run_progress() + if self.run_progress is not None and self.run_progress_task is not None: + self.run_progress.stop_task(self.run_progress_task) self.stop() self.run_progress = None self.run_progress_task = None @@ -221,13 +219,18 @@ async def on_finalize(self): def _sync_run_progress(self): """Synchronize overall progress display with task progress.""" - self.run_progress.update( - self.run_progress_task, - total=self.tasks_progress.steps_total, - completed=self.tasks_progress.steps_progress, - completed_benchmarks=self.tasks_progress.tasks_progress, - total_benchmarks=self.tasks_progress.tasks_total, - ) + if ( + self.run_progress is not None + and self.run_progress_task is not None + and self.tasks_progress is not None + ): + self.run_progress.update( + self.run_progress_task, + total=self.tasks_progress.steps_total, + completed=self.tasks_progress.steps_progress, + completed_benchmarks=self.tasks_progress.tasks_progress, + total_benchmarks=self.tasks_progress.tasks_total, + ) # Scaling factor for progress calculations to provide granular progress updates @@ -283,7 +286,7 @@ def steps_progress(self) -> int: ) progress_total = self.current_index + (progress_current_task or 0) - return progress_total * _PROGRESS_SCALE + return int(progress_total * _PROGRESS_SCALE) def start_benchmark(self, strategy: SchedulingStrategy): self.current_index += 1 @@ -294,32 +297,36 @@ def start_benchmark(self, strategy: SchedulingStrategy): task_state.task_id = task_id self.benchmark_task_states.append(task_state) - self.benchmark_task_states[self.current_index].start(strategy) - self.update( - self.benchmark_task_states[self.current_index].task_id, - start=True, - **self.benchmark_task_states[self.current_index].current, - ) + current_state = self.benchmark_task_states[self.current_index] + current_state.start(strategy) + if current_state.task_id is not None: + self.update( + current_state.task_id, + start=True, + **current_state.current, + ) def update_benchmark( self, - aggregator_update: EstimatedBenchmarkState, + accumulator: GenerativeBenchmarkAccumulator, scheduler_state: SchedulerState, ): - self.benchmark_task_states[self.current_index].update( - aggregator_update, scheduler_state - ) - self.update( - self.benchmark_task_states[self.current_index].task_id, - 
**self.benchmark_task_states[self.current_index].current, - ) + current_state = self.benchmark_task_states[self.current_index] + current_state.update(accumulator, scheduler_state) + if current_state.task_id is not None: + self.update( + current_state.task_id, + **current_state.current, + ) def complete_benchmark(self, benchmark: GenerativeBenchmark): - self.benchmark_task_states[self.current_index].complete(benchmark) - self.update( - self.benchmark_task_states[self.current_index].task_id, - **self.benchmark_task_states[self.current_index].current, - ) + current_state = self.benchmark_task_states[self.current_index] + current_state.complete(benchmark) + if current_state.task_id is not None: + self.update( + current_state.task_id, + **current_state.current, + ) def finalize(self): self.stop() @@ -327,29 +334,29 @@ def finalize(self): @dataclass class _GenerativeProgressTaskState: - strategy_type: StrategyType - task_id: TaskID = None + strategy_type: str + task_id: TaskID | None = None strategy: SchedulingStrategy | None = None benchmark_status: Literal[ - "pending", "in_warmup", "in_progress", "in_cooldown", "completed" + "pending", "warmup", "active", "cooldown", "completed" ] = "pending" progress: float | None = None start_time: float = -1.0 successful_requests: int = 0 cancelled_requests: int = 0 errored_requests: int = 0 - request_concurrency: int = 0 - requests_per_second: float = 0 - request_latency: float = 0 - output_tokens: int = 0 - output_tokens_rate: float = 0 - prompt_tokens: int = 0 - total_tokens_rate: float = 0 - time_to_first_token: float = 0 - inter_token_latency: float = 0 - queued_time: float = 0 - request_targeted_start_delay: float = 0 - scheduler_overheads_time: float = 0 + request_concurrency: float = 0.0 + requests_per_second: float = 0.0 + request_latency: float = 0.0 + output_tokens: float = 0 + output_tokens_rate: float = 0.0 + prompt_tokens: float = 0 + total_tokens_rate: float = 0.0 + time_to_first_token: float = 0.0 + inter_token_latency: float = 0.0 + queued_time: float = 0.0 + request_targeted_start_delay: float = 0.0 + scheduler_overheads_time: float = 0.0 @property def current(self) -> dict[str, Any]: @@ -367,12 +374,12 @@ def current(self) -> dict[str, Any]: @property def completed(self) -> float: if self.benchmark_status == "pending": - return 0 + return 0.0 if self.benchmark_status == "completed": - return _PROGRESS_SCALE + return float(_PROGRESS_SCALE) - return self.progress * _PROGRESS_SCALE if self.progress is not None else None + return self.progress * _PROGRESS_SCALE if self.progress is not None else 0.0 @property def total(self) -> float: @@ -387,13 +394,13 @@ def formatted_start_time(self) -> str: @property def formatted_progress_status(self) -> str: - if self.benchmark_status == "in_warmup": + if self.benchmark_status == "warmup": status = "warmup" color = Colors.progress - elif self.benchmark_status == "in_progress": + elif self.benchmark_status == "active": status = "running" color = Colors.progress - elif self.benchmark_status == "in_cooldown": + elif self.benchmark_status == "cooldown": status = "cooldown" color = Colors.progress elif self.benchmark_status == "completed": @@ -560,7 +567,7 @@ def start(self, strategy: SchedulingStrategy): def update( self, - estimated_state: EstimatedBenchmarkState, + accumulator: GenerativeBenchmarkAccumulator, scheduler_state: SchedulerState, ): self.progress = ( @@ -569,76 +576,40 @@ def update( else 0.0 ) self._update_processing_states( - benchmark_status=estimated_state.get_metric( - 
group=EstimatedBenchmarkState.benchmark_state_group, - key="status", - default=None, - ), - start_time=scheduler_state.start_time, + benchmark_status=self._map_status(accumulator.timings.status), + start_time=accumulator.timings.measure_start, successful_requests=scheduler_state.successful_requests, cancelled_requests=scheduler_state.cancelled_requests, errored_requests=scheduler_state.errored_requests, ) self._update_request_stats( - request_concurrency=estimated_state.get_metric( - group=EstimatedBenchmarkState.benchmark_metrics_group, - key="concurrency_requests", - ), - requests_per_second=estimated_state.get_metric( - group=EstimatedBenchmarkState.benchmark_metrics_group, - key="completed_requests_per_second", - ), - request_latency=estimated_state.get_metric( - group=EstimatedBenchmarkState.benchmark_metrics_group, - key="completed_request_latency", - ), + request_concurrency=accumulator.concurrency_metric.time_weighted_mean, + requests_per_second=accumulator.completed_metrics.requests.rate_per_second, + request_latency=accumulator.completed_metrics.request_latency.mean, ) self._update_token_stats( - output_tokens=estimated_state.get_metric( - group=EstimatedBenchmarkState.benchmark_metrics_group, - key="completed_output_tokens_total", - ), - output_tokens_rate=estimated_state.get_metric( - group=EstimatedBenchmarkState.benchmark_metrics_group, - key="completed_output_tokens", - ), - prompt_tokens=estimated_state.get_metric( - group=EstimatedBenchmarkState.benchmark_metrics_group, - key="completed_input_tokens_total", - ), - total_tokens_rate=estimated_state.get_metric( - group=EstimatedBenchmarkState.benchmark_metrics_group, - key="completed_total_tokens", - ), - time_to_first_token=estimated_state.get_metric( - group=EstimatedBenchmarkState.benchmark_metrics_group, - key="completed_time_to_first_token", - ), - inter_token_latency=estimated_state.get_metric( - group=EstimatedBenchmarkState.benchmark_metrics_group, - key="completed_inter_token_latency", - ), + output_tokens=accumulator.completed_metrics.total_tokens.mean, + output_tokens_rate=accumulator.completed_metrics.output_tokens.rate_per_second, + prompt_tokens=accumulator.completed_metrics.input_tokens.mean, + total_tokens_rate=accumulator.completed_metrics.total_tokens.rate_per_second, + time_to_first_token=accumulator.completed_metrics.time_to_first_token_ms.mean, + inter_token_latency=accumulator.completed_metrics.inter_token_latency_ms.mean, + converted=True, + ) + self._update_system_stats( + request_targeted_start_delay=accumulator.scheduler_metrics.request_targeted_start_delay.mean, + queued_time=accumulator.scheduler_metrics.queued_time.mean, + scheduler_overheads_time=accumulator.scheduler_metrics.resolve_end_delay.mean, + converted=False, ) - if estimated_state.get("updated_scheduler_stats"): - self._update_system_stats( - request_targeted_start_delay=estimated_state.get_metric( - group=EstimatedBenchmarkState.scheduler_state_group, - key="request_targeted_start_delay", - ), - queued_time=estimated_state.get_metric( - group=EstimatedBenchmarkState.scheduler_state_group, - key="queued_time", - ), - scheduler_overheads_time=0.0, # Need to add up metrics here - ) def complete(self, benchmark: GenerativeBenchmark): self._update_processing_states( benchmark_status="completed", start_time=benchmark.start_time, - successful_requests=benchmark.request_totals.successful, - cancelled_requests=benchmark.request_totals.incomplete, - errored_requests=benchmark.request_totals.errored, + 
successful_requests=benchmark.metrics.request_totals.successful, + cancelled_requests=benchmark.metrics.request_totals.incomplete, + errored_requests=benchmark.metrics.request_totals.errored, ) self._update_request_stats( request_concurrency=benchmark.metrics.request_concurrency.successful.mean, @@ -659,11 +630,19 @@ def complete(self, benchmark: GenerativeBenchmark): converted=True, ) + @staticmethod + def _map_status( + status: Literal["pending", "warmup", "active", "cooldown"], + ) -> Literal["pending", "warmup", "active", "cooldown", "completed"]: + """Map accumulator status to internal progress status representation.""" + return status + def _update_processing_states( self, benchmark_status: Literal[ - "pending", "in_warmup", "in_progress", "in_cooldown", "completed" - ], + "pending", "warmup", "active", "cooldown", "completed" + ] + | None = None, start_time: float | None = None, successful_requests: int | None = None, cancelled_requests: int | None = None, @@ -682,7 +661,7 @@ def _update_processing_states( def _update_request_stats( self, - request_concurrency: int | None = None, + request_concurrency: float | None = None, requests_per_second: float | None = None, request_latency: float | None = None, ): @@ -695,9 +674,9 @@ def _update_request_stats( def _update_token_stats( self, - output_tokens: int | None = None, + output_tokens: float | None = None, output_tokens_rate: float | None = None, - prompt_tokens: int | None = None, + prompt_tokens: float | None = None, total_tokens_rate: float | None = None, time_to_first_token: float | None = None, inter_token_latency: float | None = None, diff --git a/src/guidellm/benchmark/scenarios/chat.json b/src/guidellm/benchmark/scenarios/chat.json index 58fd18e2..a4137147 100644 --- a/src/guidellm/benchmark/scenarios/chat.json +++ b/src/guidellm/benchmark/scenarios/chat.json @@ -3,4 +3,4 @@ "data": [ "prompt_tokens=512,prompt_tokens_stdev=128,prompt_tokens_min=1,prompt_tokens_max=1024,output_tokens=256,output_tokens_stdev=64,output_tokens_min=1,output_tokens_max=1024" ] -} \ No newline at end of file +} diff --git a/src/guidellm/benchmark/scenarios/rag.json b/src/guidellm/benchmark/scenarios/rag.json index ea38d76e..0a82e9e9 100644 --- a/src/guidellm/benchmark/scenarios/rag.json +++ b/src/guidellm/benchmark/scenarios/rag.json @@ -3,4 +3,4 @@ "data": [ "prompt_tokens=4096,prompt_tokens_stdev=512,prompt_tokens_min=2048,prompt_tokens_max=6144,output_tokens=512,output_tokens_stdev=128,output_tokens_min=1,output_tokens_max=1024" ] -} \ No newline at end of file +} diff --git a/src/guidellm/benchmark/schemas.py b/src/guidellm/benchmark/schemas.py deleted file mode 100644 index b2fd15f5..00000000 --- a/src/guidellm/benchmark/schemas.py +++ /dev/null @@ -1,2128 +0,0 @@ -""" -Benchmark data models and metrics for generative AI performance measurement. - -Provides comprehensive data structures for capturing, storing, and analyzing -benchmark results from scheduler-driven generative AI workload executions. -Core abstractions include base benchmark interfaces, generative-specific -metrics with token/latency distributions, request-level statistics tracking, -and multi-benchmark reporting capabilities. These models enable detailed -performance analysis including throughput, latency, concurrency patterns, and -domain-specific metrics for text, image, video, and audio generation tasks. 
-""" - -from __future__ import annotations - -import inspect -import json -import random -import time -import uuid -from abc import ABC, abstractmethod -from collections.abc import Callable, Iterable -from pathlib import Path -from typing import Any, ClassVar, Literal, TypeVar, cast - -import yaml -from pydantic import ( - AliasChoices, - AliasGenerator, - ConfigDict, - Field, - ValidationError, - ValidatorFunctionWrapHandler, - computed_field, - field_validator, - model_serializer, -) -from torch.utils.data import Sampler -from transformers import PreTrainedTokenizerBase - -from guidellm.backends import Backend, BackendType -from guidellm.benchmark.profile import Profile, ProfileType -from guidellm.benchmark.scenarios import get_builtin_scenarios -from guidellm.data import DatasetPreprocessor -from guidellm.scheduler import ( - BackendInterface, - Environment, - SchedulerState, - SchedulingStrategy, - StrategyType, -) -from guidellm.schemas import ( - GenerationRequest, - GenerationResponse, - GenerativeRequestStats, - RequestInfo, - UsageMetrics, -) -from guidellm.utils import ( - InfoMixin, - StandardBaseDict, - StandardBaseModel, - StatusBreakdown, - StatusDistributionSummary, -) - -__all__ = [ - "Benchmark", - "BenchmarkGenerativeTextArgs", - "BenchmarkSchedulerStats", - "BenchmarkT", - "BenchmarkerArgs", - "BenchmarkerDict", - "EstimatedBenchmarkState", - "GenerativeAudioMetricsSummary", - "GenerativeBenchmark", - "GenerativeBenchmarksReport", - "GenerativeImageMetricsSummary", - "GenerativeMetrics", - "GenerativeMetricsSummary", - "GenerativeTextMetricsSummary", - "GenerativeVideoMetricsSummary", - "SchedulerDict", -] - - -class EstimatedBenchmarkState(dict[str, Any]): - """ - Accumulator for real-time benchmark metrics during scheduler execution. - - Tracks incremental metrics, running averages, and time-based statistics as - requests are processed. Maintains grouped metrics for benchmark state, - benchmark-level metrics, and scheduler-level metrics with support for - average, rate, and time-averaged metric calculations. - - :cvar benchmark_state_group: Metric group key for benchmark state tracking - :cvar benchmark_metrics_group: Metric group key for benchmark-level metrics - :cvar scheduler_state_group: Metric group key for scheduler-level metrics - """ - - benchmark_state_group: ClassVar[Literal["benchmark_state"]] = "benchmark_state" - benchmark_metrics_group: ClassVar[Literal["benchmark_metrics"]] = ( - "benchmark_metrics" - ) - scheduler_state_group: ClassVar[Literal["scheduler_state"]] = "scheduler_state" - - def get_metric( - self, - group: str, - key: str, - default: int | float | None = None, - ) -> int | float | None: - """ - Retrieve a grouped metric value by group and key. - - :param group: Metric group identifier - :param key: Metric key within the group - :param default: Value returned if metric doesn't exist - :return: The metric value or default if not found - """ - return self.get(f"{group}_{key}", default) - - def set_metric( - self, - group: str, - key: str, - value: bool | int | float | None, - start_val: bool | int | float | None = None, - ) -> bool | int | float | None: - """ - Set a grouped metric value, optionally adjusting by a starting value. 
- - :param group: Metric group identifier - :param key: Metric key within the group - :param value: Metric value to set - :param start_val: Optional starting value to subtract from the metric value - :return: The adjusted metric value or None if value is None - """ - if value is None: - return None - - if start_val is not None: - value -= start_val - self[f"{group}_{key}"] = value - - return value - - def add_avg_metric( - self, - group: str, - key: str, - value: bool | int | float | None, - start_val: bool | int | float | None = 0.0, - count: int | None = 1, - ): - """ - Add a value to a running average metric calculation. - - :param group: Metric group identifier - :param key: Metric key within the group - :param value: Value to add to the average - :param start_val: Optional starting value to subtract before adding - :param count: Number of observations this value represents - """ - if value is None or count is None: - return - - if start_val is not None: - value -= start_val - - total_key = f"{group}_{key}_total" - count_key = f"{group}_{key}_count" - self[total_key] = self.get(total_key, 0) + value - self[count_key] = self.get(count_key, 0) + count - - average = self[total_key] / self[count_key] if self[count_key] > 0 else 0.0 - self.set_metric( - group=group, - key=key, - value=average, - ) - - def add_avg_rate_metric( - self, - group: str, - key: str, - value: bool | int | float | None, - start_val: bool | int | float | None = 0.0, - start_time: float | None = None, - end_time: float | None = None, - numerator_type: Literal["avg", "total", "count"] = "total", - ): - """ - Add a value to a rate-based average metric calculation. - - :param group: Metric group identifier - :param key: Metric key within the group - :param value: Value to add to the average - :param start_val: Optional starting value to subtract before adding - :param start_time: Start time for rate calculation, defaults to current time - :param end_time: End time for rate calculation, defaults to current time - :param numerator_type: Type of numerator for rate calculation - """ - if value is None: - return - - self.add_avg_metric( - group=group, - key=key, - value=value, - start_val=start_val, - ) - start_time_key = f"{group}_{key}_start_time" - if self.get(start_time_key) is None: - if start_time is None: - start_time = time.time() - self[start_time_key] = start_time - else: - self[start_time_key] = start_time or self[start_time_key] - - end_time = end_time or time.time() - elapsed_time = end_time - self[start_time_key] - - if elapsed_time > 0: - numerator_key = ( - f"{group}_{key}_{numerator_type}" - if numerator_type != "avg" - else f"{group}_{key}" - ) - rate = self[numerator_key] / elapsed_time - self.set_metric( - group=group, - key=f"{key}_per_second", - value=rate, - ) - - def add_time_averaged_metric( - self, - group: str, - key: str, - value: bool | int | float | None, - recorded_time: float | None = None, - ): - """ - Add a value to a time-weighted average metric calculation. 
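For reviewers of this deletion: the time-weighted average the old accumulator tracked is the usual "value held over elapsed time" formula; a compact standalone sketch of the idea (the removed implementation differs in minor bookkeeping details):

# Sketch of a time-weighted average over (timestamp, value) samples: each value
# is weighted by how long it remained the most recent observation.
# Assumes at least one sample.
def time_weighted_mean(samples: list[tuple[float, float]]) -> float:
    numerator = 0.0
    denominator = 0.0
    for (t_prev, v_prev), (t_next, _) in zip(samples, samples[1:]):
        numerator += v_prev * (t_next - t_prev)
        denominator += t_next - t_prev
    return numerator / denominator if denominator else samples[-1][1]


print(time_weighted_mean([(0.0, 2.0), (1.0, 4.0), (3.0, 4.0)]))  # (2*1 + 4*2) / 3 -> 3.33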
- - :param group: Metric group identifier - :param key: Metric key within the group - :param value: Value to add to the time-weighted average - :param recorded_time: Time of the observation, defaults to current time - """ - if value is None: - return - - if recorded_time is None: - recorded_time = time.time() - - time_avg_numerator_key = f"{group}_{key}_time_avg_numerator" - time_avg_denominator_key = f"{group}_{key}_time_avg_denominator" - last_recorded_time_key = f"{group}_{key}_last_recorded_time" - last_recorded_value_key = f"{group}_{key}_last_recorded_value" - - if last_recorded_time_key not in self: - self[last_recorded_time_key] = recorded_time - self[last_recorded_value_key] = value - self[time_avg_numerator_key] = value - self[time_avg_denominator_key] = 0.0 - else: - time_delta = recorded_time - self[last_recorded_time_key] - self[time_avg_numerator_key] += self[last_recorded_value_key] * time_delta - self[time_avg_denominator_key] += time_delta - self[last_recorded_time_key] = recorded_time - self[last_recorded_value_key] = value - - if self[time_avg_denominator_key] > 0: - average = self[time_avg_numerator_key] / self[time_avg_denominator_key] - else: - average = value - - self.set_metric( - group=group, - key=key, - value=average, - ) - - -class BenchmarkerArgs(StandardBaseDict): - """ - Configuration parameters for benchmark execution and request sampling. - - Defines run identification, request sampling strategy, warmup/cooldown phases, - and metric preferences for benchmark executions. Provides methods to determine - whether a request falls within warmup or cooldown periods based on time, - request count, or percentage-based thresholds. - """ - - run_id: str = Field( - default_factory=lambda: str(uuid.uuid4()), - description="Unique identifier for the benchmark run", - ) - run_index: int = Field(default=0, description="Index of the benchmark run") - sample_requests: int | None = Field( - default=20, - description=( - "Number of requests to sample and keep in the final benchmark for metrics" - ), - ) - warmup: int | float | None = Field( - default=None, description="Warmup time before benchmarking starts" - ) - cooldown: int | float | None = Field( - default=None, description="Cooldown time after benchmarking ends" - ) - prefer_response_metrics: bool = Field( - default=True, - description="Whether to prefer response metrics over request metrics", - ) - - def is_in_warmup( - self, request_info: RequestInfo, scheduler_state: SchedulerState - ) -> bool: - """ - Check if a request is in the warmup phase. - - :param request_info: Information about the current request - :param scheduler_state: Current state of the scheduler - :return: True if the request is in warmup phase, False otherwise - """ - if self.warmup is not None and 0 < self.warmup < 1: - # Percentage-based warmup - return ( - scheduler_state.remaining_fraction is not None - and scheduler_state.remaining_fraction > (1 - self.warmup) - ) - - if self.warmup is not None and self.warmup > 1: - # Count/time-based warmup - if scheduler_state.processed_requests < self.warmup: - return True - - current_time = request_info.timings.targeted_start - return ( - current_time is not None - and (current_time - scheduler_state.start_time) < self.warmup - ) - - return False - - def is_in_cooldown( - self, request_info: RequestInfo, scheduler_state: SchedulerState - ) -> bool: - """ - Check if a request is in the cooldown phase. 
- - :param request_info: Information about the current request - :param scheduler_state: Current state of the scheduler - :return: True if the request is in cooldown phase, False otherwise - """ - if self.cooldown is not None and 0 < self.cooldown < 1: - # Percentage-based cooldown - return ( - scheduler_state.remaining_fraction is not None - and scheduler_state.remaining_fraction < self.cooldown - ) - - if self.cooldown is not None and self.cooldown > 1: - # Count/time-based cooldown - if ( - scheduler_state.remaining_requests is not None - and scheduler_state.remaining_requests <= self.cooldown - ): - return True - - current_time = ( - request_info.timings.resolve_end or request_info.timings.targeted_start - ) - return ( - current_time is not None - and scheduler_state.remaining_duration is not None - and scheduler_state.remaining_duration < self.cooldown - ) - - return False - - -class Benchmark(ABC): - """ - Abstract base interface for benchmark result implementations. - - Defines the contract for benchmark classes to provide run metrics sampling, - request metrics sampling, real-time estimate updates, and final compilation - of benchmark results from scheduler execution data. - """ - - @abstractmethod - def get_run_metrics_sample( - self, - ) -> dict[Literal["start_time", "end_time", "duration"], float]: - """ - Get a sample of run-level timing metrics. - - :return: Dictionary containing start_time, end_time, and duration metrics - """ - ... - - @abstractmethod - def get_request_metrics_sample( - self, - ) -> dict[ - Literal[ - "request_count", - "request_latency", - "request_throughput", - "request_concurrency", - ], - float, - ]: - """ - Get a sample of request-level performance metrics. - - :return: Dictionary containing request count, latency, throughput, and - concurrency metrics - """ - ... - - @classmethod - @abstractmethod - def update_estimate( - cls, - args: BenchmarkerArgs, - state: EstimatedBenchmarkState, - response: Any, - request: Any, - request_info: RequestInfo, - scheduler_state: SchedulerState, - ): - """ - Update real-time benchmark estimates with new request data. - - :param args: Benchmark configuration arguments - :param state: Current estimated benchmark state to update - :param response: Response received from the backend - :param request: Original request sent to the backend - :param request_info: Metadata about the request execution - :param scheduler_state: Current state of the scheduler - """ - ... - - @classmethod - @abstractmethod - def compile( - cls, - args: BenchmarkerArgs, - estimated_state: EstimatedBenchmarkState, - scheduler_state: SchedulerState, - profile: Profile, - requests: Iterable, - backend: BackendInterface, - environment: Environment, - strategy: SchedulingStrategy, - constraints: dict[str, dict[str, Any]], - ) -> Any: - """ - Compile final benchmark results from accumulated state. - - :param args: Benchmark configuration arguments - :param estimated_state: Accumulated benchmark state from execution - :param scheduler_state: Final state of the scheduler - :param profile: Benchmark profile configuration - :param requests: Collection of requests executed - :param backend: Backend interface used for execution - :param environment: Execution environment configuration - :param strategy: Scheduling strategy used - :param constraints: Execution constraints applied - :return: Compiled benchmark results instance - """ - ... 
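The warmup/cooldown checks being removed here treat values in (0, 1) as a fraction of the run and larger values as a request count or a duration in seconds; a condensed sketch of the warmup side of that rule (simplified from the deleted BenchmarkerArgs.is_in_warmup):

# Condensed sketch of the removed warmup rule: fractional values gate on the
# remaining fraction of the run, larger values gate on processed requests or
# elapsed seconds.
def in_warmup(
    warmup: float | None,
    remaining_fraction: float | None,
    processed_requests: int,
    elapsed_seconds: float,
) -> bool:
    if warmup is None:
        return False
    if 0 < warmup < 1:  # percentage of the total run
        return remaining_fraction is not None and remaining_fraction > (1 - warmup)
    return processed_requests < warmup or elapsed_seconds < warmup


print(in_warmup(0.1, remaining_fraction=0.95, processed_requests=3, elapsed_seconds=2.0))  # True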
-
-
-BenchmarkT = TypeVar("BenchmarkT", bound=Benchmark)
-
-
-class BenchmarkSchedulerStats(StandardBaseDict):
-    """Scheduler timing and performance statistics."""
-
-    group_name: ClassVar[Literal["scheduler_stats"]] = "scheduler_stats"
-
-    start_time: float = Field(
-        description="Unix timestamp when the benchmark run started"
-    )
-    end_time: float = Field(description="Unix timestamp when the benchmark run ended")
-    requests_made: StatusBreakdown[int, int, int, int] = Field(
-        description="Request counts by status: successful, incomplete, errored, total"
-    )
-    queued_time_avg: float = Field(
-        description="Avg time requests spent in the queue (seconds)"
-    )
-    worker_resolve_start_delay_avg: float = Field(
-        description="Avg delay before worker begins resolving req after dequeue (sec)"
-    )
-    worker_resolve_time_avg: float = Field(
-        description="Avg time for worker to resolve requests (seconds)"
-    )
-    worker_resolve_end_delay_avg: float = Field(
-        description="Avg delay after request end till worker resolves (seconds)"
-    )
-    finalized_delay_avg: float = Field(
-        description="Avg delay after resolve until finalized within scheduler (sec)"
-    )
-    worker_targeted_start_delay_avg: float = Field(
-        description="Avg delay from targeted start to actual worker start (seconds)"
-    )
-    request_start_delay_avg: float = Field(
-        description="Avg delay after resolve until request start (seconds)"
-    )
-    request_time_avg: float = Field(description="Avg request processing time (seconds)")
-    request_targeted_start_delay_avg: float = Field(
-        description="Avg delay from targeted start to actual request start (seconds)"
-    )
-
-    @classmethod
-    def update_estimate(cls, state: EstimatedBenchmarkState, request_info: RequestInfo):
-        """
-        Update estimated scheduler statistics with request timing information.
- - :param state: Current estimated benchmark state to update - :param request_info: Metadata about the request execution with timing data - """ - state.set_metric(group=cls.group_name, key="updated", value=True) - state.add_avg_metric( - group=cls.group_name, - key="queued_time", - value=request_info.timings.dequeued, - start_val=request_info.timings.queued, - ) - state.add_avg_metric( - group=cls.group_name, - key="worker_resolve_start_delay", - value=request_info.timings.resolve_start, - start_val=request_info.timings.scheduled_at, - ) - state.add_avg_metric( - group=cls.group_name, - key="worker_resolve_time", - value=request_info.timings.resolve_end, - start_val=request_info.timings.resolve_start, - ) - state.add_avg_metric( - group=cls.group_name, - key="worker_resolve_end_delay", - value=request_info.timings.request_end, - start_val=request_info.timings.resolve_end, - ) - state.add_avg_metric( - group=cls.group_name, - key="finalized_delay", - value=request_info.timings.finalized, - start_val=request_info.timings.resolve_end, - ) - state.add_avg_metric( - group=cls.group_name, - key="worker_targeted_start_delay", - value=request_info.timings.resolve_start, - start_val=request_info.timings.targeted_start, - ) - state.add_avg_metric( - group=cls.group_name, - key="request_start_delay", - value=request_info.timings.request_start, - start_val=request_info.timings.resolve_start, - ) - state.add_avg_metric( - group=cls.group_name, - key="request_time", - value=request_info.timings.request_end, - start_val=request_info.timings.request_start, - ) - state.add_avg_metric( - group=cls.group_name, - key="request_targeted_start_delay", - value=request_info.timings.request_start, - start_val=request_info.timings.targeted_start, - ) - - @classmethod - def compile( - cls, estimated_state: EstimatedBenchmarkState, scheduler_state: SchedulerState - ) -> BenchmarkSchedulerStats: - """ - Compile final scheduler statistics from accumulated state. 
- - :param estimated_state: Accumulated benchmark state with scheduler metrics - :param scheduler_state: Final state of the scheduler - :return: Compiled scheduler statistics instance - """ - return BenchmarkSchedulerStats( - start_time=scheduler_state.start_time, - end_time=scheduler_state.end_time or scheduler_state.start_time, - requests_made=StatusBreakdown[int, int, int, int]( - successful=scheduler_state.successful_requests, - incomplete=scheduler_state.cancelled_requests, - errored=scheduler_state.errored_requests, - total=( - scheduler_state.successful_requests - + scheduler_state.cancelled_requests - + scheduler_state.errored_requests - ), - ), - queued_time_avg=cast( - "float", - estimated_state.get_metric( - group=cls.group_name, key="queued_time", default=-1.0 - ), - ), - worker_resolve_start_delay_avg=cast( - "float", - estimated_state.get_metric( - group=cls.group_name, key="worker_resolve_start_delay", default=-1.0 - ), - ), - worker_resolve_time_avg=cast( - "float", - estimated_state.get_metric( - group=cls.group_name, key="worker_resolve_time", default=-1.0 - ), - ), - worker_resolve_end_delay_avg=cast( - "float", - estimated_state.get_metric( - group=cls.group_name, key="worker_resolve_end_delay", default=-1.0 - ), - ), - finalized_delay_avg=cast( - "float", - estimated_state.get_metric( - group=cls.group_name, key="finalized_delay", default=-1.0 - ), - ), - worker_targeted_start_delay_avg=cast( - "float", - estimated_state.get_metric( - group=cls.group_name, - key="worker_targeted_start_delay", - default=-1.0, - ), - ), - request_start_delay_avg=cast( - "float", - estimated_state.get_metric( - group=cls.group_name, key="request_start_delay", default=-1.0 - ), - ), - request_time_avg=cast( - "float", - estimated_state.get_metric( - group=cls.group_name, key="request_time", default=-1.0 - ), - ), - request_targeted_start_delay_avg=cast( - "float", - estimated_state.get_metric( - group=cls.group_name, - key="request_targeted_start_delay", - default=-1.0, - ), - ), - ) - - -class GenerativeMetricsSummary(StandardBaseDict): - """ - Statistical summaries for input, output, and total metrics. - - Provides distribution summaries across successful, incomplete, and errored - requests for absolute values, per-second rates, and concurrency levels. 
- """ - - input: StatusDistributionSummary = Field( - description="Distribution of input metric values" - ) - input_per_second: StatusDistributionSummary = Field( - description="Distribution of input metric rates per second" - ) - input_concurrency: StatusDistributionSummary = Field( - description="Distribution of concurrent input metric values" - ) - - output: StatusDistributionSummary = Field( - description="Distribution of output metric values" - ) - output_per_second: StatusDistributionSummary = Field( - description="Distribution of output metric rates per second" - ) - output_concurrency: StatusDistributionSummary = Field( - description="Distribution of concurrent output metric values" - ) - - total: StatusDistributionSummary = Field( - description="Distribution of total metric values (input + output)" - ) - total_per_second: StatusDistributionSummary = Field( - description="Distribution of total metric rates per second" - ) - total_concurrency: StatusDistributionSummary = Field( - description="Distribution of concurrent total metric values" - ) - - @classmethod - def compile( - cls, - request_types: list[Literal["successful", "incomplete", "error"]], - request_times: list[tuple[float, float]], - input_values: list[int | float], - output_values: list[int | float], - ) -> GenerativeMetricsSummary: - """ - Compile generative metrics summary from request data. - - :param request_types: Status types for each request - :param request_times: Start and end times for each request - :param input_values: Input metric values for each request - :param output_values: Output metric values for each request - :return: Compiled generative metrics summary - """ - total_values = [ - input_val + output_val - for input_val, output_val in zip(input_values, output_values, strict=False) - ] - - return GenerativeMetricsSummary( - input=StatusDistributionSummary.from_values( - value_types=request_types, - values=input_values, - ), - input_per_second=StatusDistributionSummary.from_request_times( - request_types=request_types, - requests=request_times, - distribution_type="rate", - weights=input_values, - ), - input_concurrency=StatusDistributionSummary.from_request_times( - request_types=request_types, - requests=request_times, - distribution_type="concurrency", - weights=input_values, - ), - output=StatusDistributionSummary.from_values( - value_types=request_types, - values=output_values, - ), - output_per_second=StatusDistributionSummary.from_request_times( - request_types=request_types, - requests=request_times, - distribution_type="rate", - weights=output_values, - ), - output_concurrency=StatusDistributionSummary.from_request_times( - request_types=request_types, - requests=request_times, - distribution_type="concurrency", - weights=output_values, - ), - total=StatusDistributionSummary.from_values( - value_types=request_types, - values=total_values, - ), - total_per_second=StatusDistributionSummary.from_request_times( - request_types=request_types, - requests=request_times, - distribution_type="rate", - weights=total_values, - ), - total_concurrency=StatusDistributionSummary.from_request_times( - request_types=request_types, - requests=request_times, - distribution_type="concurrency", - weights=total_values, - ), - ) - - -class GenerativeTextMetricsSummary(StandardBaseDict): - """ - Text-specific metric summaries for generative benchmarks. - - Tracks token, word, and character-level metrics across input, output, and - total usage for text generation workloads. 
- """ - - tokens: GenerativeMetricsSummary = Field( - description="Token count metrics and distributions" - ) - words: GenerativeMetricsSummary = Field( - description="Word count metrics and distributions" - ) - characters: GenerativeMetricsSummary = Field( - description="Character count metrics and distributions" - ) - - @classmethod - def compile( - cls, - request_types: list[Literal["successful", "incomplete", "error"]], - request_times: list[tuple[float, float]], - input_metrics: list[UsageMetrics], - output_metrics: list[UsageMetrics], - ) -> GenerativeTextMetricsSummary: - """ - Compile text metrics summary from request usage data. - - :param request_types: Status types for each request - :param request_times: Start and end times for each request - :param input_metrics: Input usage metrics for each request - :param output_metrics: Output usage metrics for each request - :return: Compiled text metrics summary - """ - return GenerativeTextMetricsSummary( - tokens=GenerativeMetricsSummary.compile( - request_types=request_types, - request_times=request_times, - input_values=[metrics.text_tokens or 0 for metrics in input_metrics], - output_values=[metrics.text_tokens or 0 for metrics in output_metrics], - ), - words=GenerativeMetricsSummary.compile( - request_types=request_types, - request_times=request_times, - input_values=[metrics.text_words or 0 for metrics in input_metrics], - output_values=[metrics.text_words or 0 for metrics in output_metrics], - ), - characters=GenerativeMetricsSummary.compile( - request_types=request_types, - request_times=request_times, - input_values=[ - metrics.text_characters or 0 for metrics in input_metrics - ], - output_values=[ - metrics.text_characters or 0 for metrics in output_metrics - ], - ), - ) - - -class GenerativeImageMetricsSummary(StandardBaseDict): - """ - Image-specific metric summaries for generative benchmarks. - - Tracks token, image count, pixel, and byte-level metrics across input, output, - and total usage for image generation workloads. - """ - - tokens: GenerativeMetricsSummary = Field( - description="Image token count metrics and distributions" - ) - images: GenerativeMetricsSummary = Field( - description="Image count metrics and distributions" - ) - pixels: GenerativeMetricsSummary = Field( - description="Pixel count metrics and distributions" - ) - bytes: GenerativeMetricsSummary = Field( - description="Byte size metrics and distributions" - ) - - @classmethod - def compile( - cls, - request_types: list[Literal["successful", "incomplete", "error"]], - request_times: list[tuple[float, float]], - input_metrics: list[UsageMetrics], - output_metrics: list[UsageMetrics], - ) -> GenerativeImageMetricsSummary: - """ - Compile image metrics summary from request usage data. 
- - :param request_types: Status types for each request - :param request_times: Start and end times for each request - :param input_metrics: Input usage metrics for each request - :param output_metrics: Output usage metrics for each request - :return: Compiled image metrics summary - """ - return GenerativeImageMetricsSummary( - tokens=GenerativeMetricsSummary.compile( - request_types=request_types, - request_times=request_times, - input_values=[metrics.image_tokens or 0 for metrics in input_metrics], - output_values=[metrics.image_tokens or 0 for metrics in output_metrics], - ), - images=GenerativeMetricsSummary.compile( - request_types=request_types, - request_times=request_times, - input_values=[metrics.image_count or 0 for metrics in input_metrics], - output_values=[metrics.image_count or 0 for metrics in output_metrics], - ), - pixels=GenerativeMetricsSummary.compile( - request_types=request_types, - request_times=request_times, - input_values=[metrics.image_pixels or 0 for metrics in input_metrics], - output_values=[metrics.image_pixels or 0 for metrics in output_metrics], - ), - bytes=GenerativeMetricsSummary.compile( - request_types=request_types, - request_times=request_times, - input_values=[metrics.image_bytes or 0 for metrics in input_metrics], - output_values=[metrics.image_bytes or 0 for metrics in output_metrics], - ), - ) - - -class GenerativeVideoMetricsSummary(StandardBaseDict): - """ - Video-specific metric summaries for generative benchmarks. - - Tracks token, frame count, duration, and byte-level metrics across input, - output, and total usage for video generation workloads. - """ - - tokens: GenerativeMetricsSummary = Field( - description="Video token count metrics and distributions" - ) - frames: GenerativeMetricsSummary = Field( - description="Frame count metrics and distributions" - ) - seconds: GenerativeMetricsSummary = Field( - description="Duration metrics in seconds and distributions" - ) - bytes: GenerativeMetricsSummary = Field( - description="Byte size metrics and distributions" - ) - - @classmethod - def compile( - cls, - request_types: list[Literal["successful", "incomplete", "error"]], - request_times: list[tuple[float, float]], - input_metrics: list[UsageMetrics], - output_metrics: list[UsageMetrics], - ) -> GenerativeVideoMetricsSummary: - """ - Compile video metrics summary from request usage data. 
- - :param request_types: Status types for each request - :param request_times: Start and end times for each request - :param input_metrics: Input usage metrics for each request - :param output_metrics: Output usage metrics for each request - :return: Compiled video metrics summary - """ - return GenerativeVideoMetricsSummary( - tokens=GenerativeMetricsSummary.compile( - request_types=request_types, - request_times=request_times, - input_values=[metrics.video_tokens or 0 for metrics in input_metrics], - output_values=[metrics.video_tokens or 0 for metrics in output_metrics], - ), - frames=GenerativeMetricsSummary.compile( - request_types=request_types, - request_times=request_times, - input_values=[metrics.video_frames or 0 for metrics in input_metrics], - output_values=[metrics.video_frames or 0 for metrics in output_metrics], - ), - seconds=GenerativeMetricsSummary.compile( - request_types=request_types, - request_times=request_times, - input_values=[metrics.video_seconds or 0 for metrics in input_metrics], - output_values=[ - metrics.video_seconds or 0 for metrics in output_metrics - ], - ), - bytes=GenerativeMetricsSummary.compile( - request_types=request_types, - request_times=request_times, - input_values=[metrics.video_bytes or 0 for metrics in input_metrics], - output_values=[metrics.video_bytes or 0 for metrics in output_metrics], - ), - ) - - -class GenerativeAudioMetricsSummary(StandardBaseDict): - """ - Audio-specific metric summaries for generative benchmarks. - - Tracks token, sample count, duration, and byte-level metrics across input, - output, and total usage for audio generation workloads. - """ - - tokens: GenerativeMetricsSummary = Field( - description="Audio token count metrics and distributions" - ) - samples: GenerativeMetricsSummary = Field( - description="Sample count metrics and distributions" - ) - seconds: GenerativeMetricsSummary = Field( - description="Duration metrics in seconds and distributions" - ) - bytes: GenerativeMetricsSummary = Field( - description="Byte size metrics and distributions" - ) - - @classmethod - def compile( - cls, - request_types: list[Literal["successful", "incomplete", "error"]], - request_times: list[tuple[float, float]], - input_metrics: list[UsageMetrics], - output_metrics: list[UsageMetrics], - ) -> GenerativeAudioMetricsSummary: - """ - Compile audio metrics summary from request usage data. 
- - :param request_types: Status types for each request - :param request_times: Start and end times for each request - :param input_metrics: Input usage metrics for each request - :param output_metrics: Output usage metrics for each request - :return: Compiled audio metrics summary - """ - return GenerativeAudioMetricsSummary( - tokens=GenerativeMetricsSummary.compile( - request_types=request_types, - request_times=request_times, - input_values=[metrics.audio_tokens or 0 for metrics in input_metrics], - output_values=[metrics.audio_tokens or 0 for metrics in output_metrics], - ), - samples=GenerativeMetricsSummary.compile( - request_types=request_types, - request_times=request_times, - input_values=[metrics.audio_samples or 0 for metrics in input_metrics], - output_values=[ - metrics.audio_samples or 0 for metrics in output_metrics - ], - ), - seconds=GenerativeMetricsSummary.compile( - request_types=request_types, - request_times=request_times, - input_values=[metrics.audio_seconds or 0 for metrics in input_metrics], - output_values=[ - metrics.audio_seconds or 0 for metrics in output_metrics - ], - ), - bytes=GenerativeMetricsSummary.compile( - request_types=request_types, - request_times=request_times, - input_values=[metrics.audio_bytes or 0 for metrics in input_metrics], - output_values=[metrics.audio_bytes or 0 for metrics in output_metrics], - ), - ) - - -class GenerativeMetrics(StandardBaseDict): - """Comprehensive metrics for generative AI benchmarks.""" - - # Request stats - requests_per_second: StatusDistributionSummary = Field( - description="Distribution of requests per second across benchmark execution" - ) - request_concurrency: StatusDistributionSummary = Field( - description="Distribution of concurrent request counts during execution" - ) - request_latency: StatusDistributionSummary = Field( - description="Distribution of request latencies for completed requests" - ) - request_streaming_iterations_count: StatusDistributionSummary = Field( - description="Distribution of stream iterations for completed requests" - ) - - # General token stats - prompt_token_count: StatusDistributionSummary = Field( - description="Distribution of prompt token counts by request status" - ) - output_token_count: StatusDistributionSummary = Field( - description="Distribution of output token counts by request status" - ) - total_token_count: StatusDistributionSummary = Field( - description="Distribution of total token counts by request status" - ) - time_to_first_token_ms: StatusDistributionSummary = Field( - description="Distribution of first token latencies in milliseconds" - ) - time_per_output_token_ms: StatusDistributionSummary = Field( - description="Distribution of average time per output token in milliseconds" - ) - inter_token_latency_ms: StatusDistributionSummary = Field( - description="Distribution of inter-token latencies in milliseconds" - ) - output_tokens_wo_first_per_iteration: StatusDistributionSummary = Field( - description=( - "Distribution of output tokens (without first) generated per " - "streaming iteration" - ) - ) - output_tokens_per_second: StatusDistributionSummary = Field( - description="Distribution of output token generation rates" - ) - output_tokens_per_iteration: StatusDistributionSummary = Field( - description="Distribution of output tokens generated per streaming iteration" - ) - tokens_per_second: StatusDistributionSummary = Field( - description="Distribution of total token throughput including prompt and output" - ) - - # Domain specific stats - text: 
GenerativeTextMetricsSummary = Field( - description="Text-specific metrics for tokens, words, and characters" - ) - image: GenerativeImageMetricsSummary = Field( - description="Image-specific metrics for tokens, images, pixels, and bytes" - ) - video: GenerativeVideoMetricsSummary = Field( - description="Video-specific metrics for tokens, frames, duration, and bytes" - ) - audio: GenerativeAudioMetricsSummary = Field( - description="Audio-specific metrics for tokens, samples, duration, and bytes" - ) - - @classmethod - def update_estimate( - cls, - state: EstimatedBenchmarkState, - response: GenerationResponse | None, - request: GenerationRequest, - request_info: RequestInfo, - scheduler_state: SchedulerState, - ): - """ - Update real-time generative metrics estimates with new request data. - - :param state: Current estimated benchmark state to update - :param response: Response received from the backend - :param request: Original request sent to the backend - :param request_info: Metadata about the request execution - :param scheduler_state: Current state of the scheduler - """ - benchmark_start_time = scheduler_state.start_time - request_start_time = ( - request_info.timings.request_start or request_info.timings.resolve_start - ) - request_end_time = ( - request_info.timings.request_end or request_info.timings.resolve_end - ) - event_occurence_time = ( - request_info.timings.queued - if request_info.status == "queued" - else ( - request_info.timings.dequeued - if request_info.status == "pending" - else request_start_time - if request_info.status == "in_progress" - else request_end_time - ) - ) - benchmark_duration = ( - event_occurence_time - benchmark_start_time - if event_occurence_time - else None - ) - request_duration = ( - (request_end_time - request_start_time) - if request_end_time and request_start_time - else None - ) - - # Always track concurrency - if event_occurence_time is not None: - state.add_time_averaged_metric( - group=EstimatedBenchmarkState.benchmark_metrics_group, - key="concurrency_requests", - value=scheduler_state.processing_requests, - recorded_time=event_occurence_time, - ) - - if request_info.status not in {"completed", "errored", "cancelled"}: - return - - state.set_metric( - group=EstimatedBenchmarkState.benchmark_metrics_group, - key="updated", - value=True, - ) - - for prefix in (request_info.status, "total"): - requests_count = ( - scheduler_state.successful_requests - if prefix == "completed" - else scheduler_state.errored_requests - if prefix == "errored" - else scheduler_state.cancelled_requests - if prefix == "cancelled" - else scheduler_state.processed_requests - ) - input_tokens = ( - (response.input_metrics.total_tokens if response else None) - or request.input_metrics.total_tokens - or 0 - ) - output_tokens = ( - (response.output_metrics.total_tokens if response else None) - or request.output_metrics.total_tokens - or 0 - ) - - # Request distribution stats - state.set_metric( - group=EstimatedBenchmarkState.benchmark_metrics_group, - key=f"{prefix}_requests", - value=requests_count, - ) - state.set_metric( - group=EstimatedBenchmarkState.benchmark_metrics_group, - key=f"{prefix}_requests_per_second", - value=( - requests_count / benchmark_duration if benchmark_duration else None - ), - ) - state.add_avg_metric( - group=EstimatedBenchmarkState.benchmark_metrics_group, - key=f"{prefix}_request_latency", - value=request_duration, - ) - state.add_avg_metric( - group=EstimatedBenchmarkState.benchmark_metrics_group, - 
key=f"{prefix}_request_streaming_iterations", - value=request_info.timings.iterations or 0, - ) - - # Token iteration stats - state.add_avg_metric( - group=EstimatedBenchmarkState.benchmark_metrics_group, - key="output_tokens_iterations", - value=output_tokens, - count=request_info.timings.iterations or 1, - ) - state.add_avg_metric( - group=EstimatedBenchmarkState.benchmark_metrics_group, - key="output_tokens_wo_first_iterations", - value=output_tokens - 1 if output_tokens > 1 else 0, - count=request_info.timings.iterations or 1, - ) - - # Token metrics stats - state.add_avg_metric( - group=EstimatedBenchmarkState.benchmark_metrics_group, - key=f"{prefix}_time_to_first_token", - value=request_info.timings.first_iteration, - start_val=request_start_time, - ) - state.add_avg_metric( - group=EstimatedBenchmarkState.benchmark_metrics_group, - key=f"{prefix}_inter_token_latency", - value=request_info.timings.last_iteration, - start_val=request_info.timings.first_iteration, - count=(output_tokens or 1) - 1, - ) - state.add_avg_metric( - group=EstimatedBenchmarkState.benchmark_metrics_group, - key=f"{prefix}_time_per_output_token", - value=request_duration, - count=output_tokens or 0, - ) - - # Input/output throughput stats - if event_occurence_time is not None: - state.add_avg_rate_metric( - group=EstimatedBenchmarkState.benchmark_metrics_group, - key="input_tokens", - value=input_tokens, - start_time=benchmark_start_time, - end_time=event_occurence_time, - ) - state.add_avg_rate_metric( - group=EstimatedBenchmarkState.benchmark_metrics_group, - key="output_tokens", - value=output_tokens, - start_time=benchmark_start_time, - end_time=event_occurence_time, - ) - state.add_avg_rate_metric( - group=EstimatedBenchmarkState.benchmark_metrics_group, - key="total_tokens", - value=input_tokens + output_tokens, - start_time=benchmark_start_time, - end_time=event_occurence_time, - ) - state.add_avg_rate_metric( - group=EstimatedBenchmarkState.benchmark_metrics_group, - key="input_text_tokens", - value=( - (response.input_metrics.text_tokens if response else None) - or request.input_metrics.text_tokens - or 0 - ), - start_time=benchmark_start_time, - end_time=event_occurence_time, - ) - state.add_avg_rate_metric( - group=EstimatedBenchmarkState.benchmark_metrics_group, - key="input_images", - value=( - (response.input_metrics.image_count if response else None) - or request.input_metrics.image_count - or 0 - ), - start_time=benchmark_start_time, - end_time=event_occurence_time, - ) - state.add_avg_rate_metric( - group=EstimatedBenchmarkState.benchmark_metrics_group, - key="input_video_frames", - value=( - (response.input_metrics.video_frames if response else None) - or request.input_metrics.video_frames - or 0 - ), - start_time=benchmark_start_time, - end_time=event_occurence_time, - ) - state.add_avg_rate_metric( - group=EstimatedBenchmarkState.benchmark_metrics_group, - key="input_audio_seconds", - value=request.input_metrics.audio_seconds or 0, - start_time=benchmark_start_time, - end_time=event_occurence_time, - ) - - @classmethod - def compile( - cls, - completed: list[GenerativeRequestStats], - errored: list[GenerativeRequestStats], - incomplete: list[GenerativeRequestStats], - ) -> GenerativeMetrics: - """ - Compile final generative metrics from request statistics. 
- - :param completed: Successfully completed request statistics - :param errored: Failed request statistics - :param incomplete: Incomplete/cancelled request statistics - :return: Compiled generative metrics with full distributions - """ - requests = completed + errored + incomplete - request_types = cast( - "list[Literal['successful', 'error', 'incomplete']]", - ["successful"] * len(completed) - + ["error"] * len(errored) - + ["incomplete"] * len(incomplete), - ) - request_times = [ - ( - req.info.timings.request_start or req.info.timings.resolve_start or 0, - req.info.timings.request_end or req.info.timings.resolve_end or 0, - ) - for req in requests - ] - input_metrics = [req.input_metrics for req in requests] - output_metrics = [req.output_metrics for req in requests] - - return GenerativeMetrics( - # Request stats - requests_per_second=StatusDistributionSummary.from_request_times( - request_types=request_types, - requests=request_times, - distribution_type="rate", - ), - request_concurrency=StatusDistributionSummary.from_request_times( - request_types=request_types, - requests=request_times, - distribution_type="concurrency", - ), - request_latency=StatusDistributionSummary.from_values( - value_types=request_types, - values=[req.request_latency or 0.0 for req in requests], - ), - request_streaming_iterations_count=StatusDistributionSummary.from_values( - value_types=request_types, - values=[float(req.info.timings.iterations or 0) for req in requests], - ), - # General token stats - prompt_token_count=StatusDistributionSummary.from_values( - value_types=request_types, - values=[float(req.prompt_tokens or 0) for req in requests], - ), - output_token_count=StatusDistributionSummary.from_values( - value_types=request_types, - values=[float(req.output_tokens or 0) for req in requests], - ), - total_token_count=StatusDistributionSummary.from_values( - value_types=request_types, - values=[float(req.total_tokens or 0) for req in requests], - ), - time_to_first_token_ms=StatusDistributionSummary.from_values( - value_types=request_types, - values=[req.time_to_first_token_ms or 0.0 for req in requests], - ), - time_per_output_token_ms=StatusDistributionSummary.from_values( - value_types=request_types, - values=[req.time_per_output_token_ms or 0.0 for req in requests], - ), - inter_token_latency_ms=StatusDistributionSummary.from_values( - value_types=request_types, - values=[req.inter_token_latency_ms or 0.0 for req in requests], - ), - output_tokens_wo_first_per_iteration=StatusDistributionSummary.from_values( - value_types=request_types, - values=[ - max(0.0, (req.output_metrics.total_tokens or 1.0) - 1.0) - for req in requests - ], - weights=[req.info.timings.iterations or 1 for req in requests], - ), - output_tokens_per_second=StatusDistributionSummary.from_values( - value_types=request_types, - values=[req.output_tokens_per_second or 0.0 for req in requests], - ), - output_tokens_per_iteration=StatusDistributionSummary.from_values( - value_types=request_types, - values=[req.output_tokens_per_iteration or 0.0 for req in requests], - weights=[req.info.timings.iterations or 1 for req in requests], - ), - tokens_per_second=StatusDistributionSummary.from_values( - value_types=request_types, - values=[req.tokens_per_second or 0.0 for req in requests], - ), - # Domain-specific stats - text=GenerativeTextMetricsSummary.compile( - request_types=request_types, - request_times=request_times, - input_metrics=input_metrics, - output_metrics=output_metrics, - ), - 
image=GenerativeImageMetricsSummary.compile( - request_types=request_types, - request_times=request_times, - input_metrics=input_metrics, - output_metrics=output_metrics, - ), - video=GenerativeVideoMetricsSummary.compile( - request_types=request_types, - request_times=request_times, - input_metrics=input_metrics, - output_metrics=output_metrics, - ), - audio=GenerativeAudioMetricsSummary.compile( - request_types=request_types, - request_times=request_times, - input_metrics=input_metrics, - output_metrics=output_metrics, - ), - ) - - -class SchedulerDict(StandardBaseDict): - """Scheduler configuration and execution state dictionary.""" - - strategy: SchedulingStrategy = Field( - description="Scheduling strategy used for request distribution" - ) - constraints: dict[str, dict[str, Any]] = Field( - description="Execution constraints applied during benchmarking" - ) - state: SchedulerState = Field( - description="Final state of the scheduler after execution" - ) - - -class BenchmarkerDict(StandardBaseDict): - """Benchmarker configuration and component settings dictionary.""" - - profile: Profile = Field(description="Benchmark profile configuration") - requests: dict[str, Any] = Field( - description="Request configuration and dataset information" - ) - backend: dict[str, Any] = Field( - description="Backend configuration and connection details" - ) - environment: dict[str, Any] = Field( - description="Execution environment configuration" - ) - - -class GenerativeBenchmark(Benchmark, StandardBaseDict): - """Complete generative AI benchmark results with specialized metrics.""" - - group_name: ClassVar[Literal["generative_benchmark"]] = "generative_benchmark" - - type_: Literal["generative_benchmark"] = "generative_benchmark" # type: ignore[assignment] - id_: str = Field( - default_factory=lambda: str(uuid.uuid4()), - description="Unique identifier for this benchmark execution", - ) - run_id: str = Field( - description="Identifier for the benchmarker run containing this benchmark" - ) - run_index: int = Field( - description="Sequential index of this benchmark within the benchmarker run" - ) - scheduler: SchedulerDict = Field( - description="Scheduler configuration and execution state" - ) - benchmarker: BenchmarkerDict = Field( - description="Benchmarker configuration and component settings" - ) - run_stats: BenchmarkSchedulerStats = Field( - description="Scheduler timing and performance statistics" - ) - start_time: float = Field( - default=-1.0, description="Unix timestamp when the first request was initiated" - ) - end_time: float = Field( - default=-1.0, description="Unix timestamp when the last request completed" - ) - - def get_run_metrics_sample( - self, - ) -> dict[Literal["start_time", "end_time", "duration"], float]: - return { - "start_time": self.start_time, - "end_time": self.end_time, - "duration": self.duration, - } - - def get_request_metrics_sample( - self, - ) -> dict[ - Literal[ - "request_count", - "request_latency", - "request_throughput", - "request_concurrency", - ], - float, - ]: - return { - "request_count": self.request_totals.successful, - "request_latency": self.metrics.request_latency.successful.mean, - "request_throughput": self.metrics.requests_per_second.successful.mean, - "request_concurrency": self.metrics.request_concurrency.successful.mean, - } - - @computed_field # type: ignore[misc] - @property - def duration(self) -> float: - """ - Benchmark execution duration in seconds. - - :return: Time elapsed from first request start to last request completion. 
- """ - return self.end_time - self.start_time - - metrics: GenerativeMetrics = Field( - description="Performance metrics and statistical distributions" - ) - request_totals: StatusBreakdown[int, int, int, int] = Field( - description="Request counts by status: successful, incomplete, errored, total" - ) - requests: StatusBreakdown[ - list[GenerativeRequestStats], - list[GenerativeRequestStats], - list[GenerativeRequestStats], - None, - ] = Field( - description="Request details grouped by status: successful, incomplete, errored" - ) - - @classmethod - def update_estimate( - cls, - args: BenchmarkerArgs, - state: EstimatedBenchmarkState, - response: GenerationResponse | None, - request: GenerationRequest, - request_info: RequestInfo, - scheduler_state: SchedulerState, - ): - """ - Update generative benchmark estimates with new request data. - - Handles warmup/cooldown filtering, request sampling via reservoir sampling, - and delegates metric updates to child metric classes. - - :param args: Benchmark configuration arguments - :param state: Current estimated benchmark state to update - :param response: Response received from the backend - :param request: Original request sent to the backend - :param request_info: Metadata about the request execution - :param scheduler_state: Current state of the scheduler - """ - if ( - request_info.status == "cancelled" - and request_info.timings.resolve_start is None - ): - # Cancelled requests that never started should be ignored - return - - # Update child metric groups - BenchmarkSchedulerStats.update_estimate(state, request_info) - GenerativeMetrics.update_estimate( - state, response, request, request_info, scheduler_state - ) - - # Store requests and sampling info, update counts - if "requests_completed" not in state: - state["requests_completed"] = [] - state["samples_completed"] = [] - state["requests_errored"] = [] - state["samples_errored"] = [] - state["requests_incomplete"] = [] - state["samples_incomplete"] = [] - in_warmup = state.set_metric( - group=EstimatedBenchmarkState.benchmark_state_group, - key="in_warmup", - value=args.is_in_warmup(request_info, scheduler_state), - ) - in_cooldown = state.set_metric( - group=EstimatedBenchmarkState.benchmark_state_group, - key="in_cooldown", - value=args.is_in_cooldown(request_info, scheduler_state), - ) - state[f"{EstimatedBenchmarkState.benchmark_state_group}_status"] = ( - "in_cooldown" - if in_cooldown - else "in_warmup" - if in_warmup - else "in_progress" - ) - - if ( - request_info.status not in {"completed", "errored", "cancelled"} - or in_warmup - or in_cooldown - ): - # Must be fully resolved to be added - return - - state.set_metric( - group=EstimatedBenchmarkState.benchmark_state_group, - key="updated", - value=True, - ) - - if response is None: - response = GenerationResponse( - request_id=request.request_id, request_args=str(request.arguments) - ) - - stats = response.compile_stats( - request, request_info, args.prefer_response_metrics - ) - - # Determine status and get corresponding lists - if request_info.status == "completed": - requests_list = state["requests_completed"] - samples_list = state["samples_completed"] - elif request_info.status == "errored": - requests_list = state["requests_errored"] - samples_list = state["samples_errored"] - else: # cancelled (incomplete) - requests_list = state["requests_incomplete"] - samples_list = state["samples_incomplete"] - - # Add to requests list - requests_list.append(stats) - current_index = len(requests_list) - 1 - - # Handle request 
sampling logic - if args.sample_requests is None: - # No sampling, add index to samples list - samples_list.append(current_index) - elif args.sample_requests > 0 and len(samples_list) < args.sample_requests: - # Space in samples list, add index - samples_list.append(current_index) - elif ( - args.sample_requests > 0 - and (replace_index := random.randrange(len(requests_list))) - < args.sample_requests - ): - # No space, adding based on reservoir sampling - samples_list[replace_index] = current_index - # Sampling set to 0, don't keep any requests - - @classmethod - def compile( - cls, - args: BenchmarkerArgs, - estimated_state: EstimatedBenchmarkState, - scheduler_state: SchedulerState, - profile: Profile, - requests: Iterable, # noqa: ARG003 - backend: BackendInterface, - environment: Environment, - strategy: SchedulingStrategy, - constraints: dict[str, dict[str, Any]], - data: list[Any], - ) -> GenerativeBenchmark: - """ - Compile final generative benchmark from accumulated state. - - :param args: Benchmark configuration arguments - :param estimated_state: Accumulated benchmark state from execution - :param scheduler_state: Final state of the scheduler - :param profile: Benchmark profile configuration - :param requests: Collection of requests executed - :param backend: Backend interface used for execution - :param environment: Execution environment configuration - :param strategy: Scheduling strategy used - :param constraints: Execution constraints applied - :return: Compiled generative benchmark instance - """ - return GenerativeBenchmark( - run_id=args.run_id, - run_index=args.run_index, - scheduler=SchedulerDict( - strategy=strategy, - constraints={ - key: InfoMixin.extract_from_obj(val) - for key, val in constraints.items() - }, - state=scheduler_state, - ), - benchmarker=BenchmarkerDict( - profile=profile, - requests={"data": data}, - backend=backend.info, - environment=environment.info, - ), - run_stats=BenchmarkSchedulerStats.compile(estimated_state, scheduler_state), - start_time=scheduler_state.start_time or -1.0, - end_time=scheduler_state.end_time or -1.0, - metrics=GenerativeMetrics.compile( - completed=estimated_state.get("requests_completed", []), - errored=estimated_state.get("requests_errored", []), - incomplete=estimated_state.get("requests_incomplete", []), - ), - request_totals=StatusBreakdown[int, int, int, int]( - successful=len(estimated_state.get("requests_completed", [])), - incomplete=len(estimated_state.get("requests_incomplete", [])), - errored=len(estimated_state.get("requests_errored", [])), - total=( - len(estimated_state.get("requests_completed", [])) - + len(estimated_state.get("requests_incomplete", [])) - + len(estimated_state.get("requests_errored", [])) - ), - ), - requests=StatusBreakdown[ - list[GenerativeRequestStats], - list[GenerativeRequestStats], - list[GenerativeRequestStats], - None, - ]( - successful=estimated_state.get("requests_completed", []), - incomplete=estimated_state.get("requests_incomplete", []), - errored=estimated_state.get("requests_errored", []), - total=None, - ), - ) - - -class BenchmarkGenerativeTextArgs(StandardBaseModel): - """ - Configuration arguments for generative text benchmark execution. - - Defines all parameters for benchmark setup including target endpoint, data - sources, backend configuration, processing pipeline, output formatting, and - execution constraints. Supports loading from scenario files and merging with - runtime overrides. 
- """ - - @classmethod - def create( - cls, scenario: Path | str | None, **kwargs: dict[str, Any] - ) -> BenchmarkGenerativeTextArgs: - """ - Create benchmark args from scenario file and/or keyword arguments. - - :param scenario: Path to scenario file or name of built-in scenario - :param kwargs: Additional keyword arguments to override scenario values - :return: Configured benchmark args instance - :raises ValueError: If scenario is not found or file format is unsupported - """ - constructor_kwargs = {} - - if scenario is not None: - if isinstance(scenario, str) and scenario in ( - builtin_scenarios := get_builtin_scenarios() - ): - scenario_path = builtin_scenarios[scenario] - elif Path(scenario).exists() and Path(scenario).is_file(): - scenario_path = Path(scenario) - else: - raise ValueError(f"Scenario '{scenario}' not found.") - - with scenario_path.open() as file: - if scenario_path.suffix == ".json": - scenario_data = json.load(file) - elif scenario_path.suffix in {".yaml", ".yml"}: - scenario_data = yaml.safe_load(file) - else: - raise ValueError( - f"Unsupported scenario file format: {scenario_path.suffix}" - ) - if "args" in scenario_data: - # loading from a report file - scenario_data = scenario_data["args"] - constructor_kwargs.update(scenario_data) - - # Apply overrides from kwargs - constructor_kwargs.update(kwargs) - - return cls.model_validate(constructor_kwargs) - - @classmethod - def get_default(cls: type[BenchmarkGenerativeTextArgs], field: str) -> Any: - """ - Get default value for a model field. - - :param field: Name of the field to retrieve default for - :return: Default value for the specified field - :raises ValueError: If field is not found in model - """ - if field not in BenchmarkGenerativeTextArgs.model_fields: - raise ValueError( - f"Field '{field}' not found in BenchmarkGenerativeTextArgs" - ) - - field_info = BenchmarkGenerativeTextArgs.model_fields[field] - factory = field_info.default_factory - - if factory is None: - return field_info.default - - if len(inspect.signature(factory).parameters) == 0: - return factory() # type: ignore[call-arg] # Confirmed correct at runtime by code above - else: - return factory({}) # type: ignore[call-arg] # Confirmed correct at runtime by code above - - model_config = ConfigDict( - extra="ignore", - use_enum_values=True, - from_attributes=True, - arbitrary_types_allowed=True, - validate_by_alias=True, - validate_by_name=True, - alias_generator=AliasGenerator( - # Support field names with hyphens - validation_alias=lambda field_name: AliasChoices( - field_name, field_name.replace("_", "-") - ), - ), - ) - - # Required - target: str = Field(description="Target endpoint URL for benchmark execution") - data: list[Any] = Field( - description="List of dataset sources or data files", - default_factory=list, - min_length=1, - ) - # Benchmark configuration - profile: StrategyType | ProfileType | Profile = Field( - default="sweep", description="Benchmark profile or scheduling strategy type" - ) - rate: list[float] | None = Field( - default=None, description="Request rate(s) for rate-based scheduling" - ) - # Backend configuration - backend: BackendType | Backend = Field( - default="openai_http", description="Backend type or instance for execution" - ) - backend_kwargs: dict[str, Any] | None = Field( - default=None, description="Additional backend configuration arguments" - ) - model: str | None = Field(default=None, description="Model identifier for backend") - # Data configuration - processor: str | Path | PreTrainedTokenizerBase | 
None = Field( - default=None, description="Tokenizer path, name, or instance for processing" - ) - processor_args: dict[str, Any] | None = Field( - default=None, description="Additional tokenizer configuration arguments" - ) - data_args: list[dict[str, Any]] | None = Field( - default_factory=list, description="Per-dataset configuration arguments" - ) - data_samples: int = Field( - default=-1, description="Number of samples to use from datasets (-1 for all)" - ) - data_column_mapper: ( - DatasetPreprocessor | dict[str, str] | Literal["generative_column_mapper"] - ) = Field( - default="generative_column_mapper", - description="Column mapping preprocessor for dataset fields", - ) - data_request_formatter: DatasetPreprocessor | dict[str, str] | str = Field( - default="chat_completions", - description="Request formatting preprocessor or template name", - validation_alias=AliasChoices( - "data_request_formatter", - "data-request-formatter", - "request_type", - "request-type", - ), - ) - data_collator: Callable | Literal["generative"] | None = Field( - default="generative", description="Data collator for batch processing" - ) - data_sampler: Sampler[int] | Literal["shuffle"] | None = Field( - default=None, description="Data sampler for request ordering" - ) - data_num_workers: int | None = Field( - default=None, description="Number of workers for data loading" - ) - dataloader_kwargs: dict[str, Any] | None = Field( - default=None, description="Additional dataloader configuration arguments" - ) - random_seed: int = Field(default=42, description="Random seed for reproducibility") - # Output configuration - output_path: str | Path | None = Field( - default_factory=Path.cwd, description="Directory path for output files" - ) - output_formats: list[str] | dict[str, str | dict[str, Any]] | None = Field( - default_factory=lambda: ["console", "json"], - description="Output format names or configuration mappings", - ) - # Benchmarker configuration - benchmark_cls: type[GenerativeBenchmark] = Field( - default=GenerativeBenchmark, - description="Benchmark class to use for result compilation", - ) - sample_requests: int | None = Field( - default=10, - description="Number of requests to sample for detailed metrics (None for all)", - ) - warmup: float | None = Field( - default=None, - description="Warmup period in seconds, requests, or fraction (0-1)", - ) - cooldown: float | None = Field( - default=None, - description="Cooldown period in seconds, requests, or fraction (0-1)", - ) - prefer_response_metrics: bool = Field( - default=True, - description="Whether to prefer backend response metrics over request metrics", - ) - # Constraints configuration - max_seconds: int | float | None = Field( - default=None, description="Maximum benchmark execution time in seconds" - ) - max_requests: int | None = Field( - default=None, description="Maximum number of requests to execute" - ) - max_errors: int | None = Field( - default=None, description="Maximum number of errors before stopping" - ) - max_error_rate: float | None = Field( - default=None, description="Maximum error rate (0-1) before stopping" - ) - max_global_error_rate: float | None = Field( - default=None, description="Maximum global error rate (0-1) before stopping" - ) - - @field_validator("data", "data_args", "rate", mode="wrap") - @classmethod - def single_to_list( - cls, value: Any, handler: ValidatorFunctionWrapHandler - ) -> list[Any]: - """ - Ensures field is always a list. 
- - :param value: Input value for the 'data' field - :return: List of data sources - """ - try: - return handler(value) - except ValidationError as err: - # If validation fails, try wrapping the value in a list - if err.errors()[0]["type"] == "list_type": - return handler([value]) - else: - raise - - @model_serializer - def serialize_model(self): - """ - Custom serialization logic for benchmark args. - - Converts complex types to serializable formats including Profile to type - string, Backend to type string, and Path objects to strings. - - :return: Dictionary representation suitable for JSON/YAML serialization - """ - return { - # target - serialize as is - "target": self.target, - "data": [ - item if isinstance(item, str | type(None)) else str(item) - for item in self.data - ], # data - for each item in the list, if not a str or None, save str(item) - "profile": ( - self.profile.type_ - if isinstance(self.profile, Profile) - else self.profile - ), # profile - if instance of Profile, then save as profile.type_ - "rate": self.rate, - "backend": ( - self.backend.type_ - if isinstance(self.backend, Backend) - else self.backend - ), # backend - if instance of Backend, then save as backend.type_ - "backend_kwargs": self.backend_kwargs, - "model": self.model, - "processor": ( - self.processor - if isinstance(self.processor, str) - else str(self.processor) - if self.processor is not None - else None - ), # processor - if not str, then save as str(processor) - "processor_args": self.processor_args, - "data_args": self.data_args, - "data_samples": self.data_samples, - "data_column_mapper": ( - self.data_column_mapper - if isinstance(self.data_column_mapper, dict | str) - else {} - ), # data_column_mapper - if not dict or str, then save as an empty dict - "data_request_formatter": ( - self.data_request_formatter - if isinstance(self.data_request_formatter, dict | str) - else {} - ), # data_request_formatter - if not dict or str, then save as empty dict - "data_collator": ( - self.data_collator if isinstance(self.data_collator, str) else None - ), # data_collator - if not str, then save as None - "data_sampler": ( - self.data_sampler if isinstance(self.data_sampler, str) else None - ), # data_sampler - if not str, then save as None - "data_num_workers": self.data_num_workers, - "dataloader_kwargs": self.dataloader_kwargs, - "random_seed": self.random_seed, - "output_path": ( - str(self.output_path) if self.output_path is not None else None - ), # output_path - if not None, then ensure it's a str - "output_formats": self.output_formats, - # benchmark_cls - don't save at all (excluded) - "sample_requests": self.sample_requests, - "warmup": self.warmup, - "cooldown": self.cooldown, - "prefer_response_metrics": self.prefer_response_metrics, - "max_seconds": self.max_seconds, - "max_requests": self.max_requests, - "max_errors": self.max_errors, - "max_error_rate": self.max_error_rate, - "max_global_error_rate": self.max_global_error_rate, - } - - -class GenerativeBenchmarksReport(StandardBaseModel): - """Container for multiple benchmark results with load/save functionality.""" - - DEFAULT_FILE: ClassVar[str] = "benchmarks.json" - - @staticmethod - def load_file( - path: str | Path, type_: Literal["json", "yaml"] | None = None - ) -> GenerativeBenchmarksReport: - """ - Load a report from a file. - - :param path: The path to load the report from. - :param type_: File type override, auto-detected from extension if None. - :return: The loaded report. - :raises ValueError: If file type is unsupported. 
- """ - path = Path(path) if not isinstance(path, Path) else path - - if path.is_dir(): - path = path / GenerativeBenchmarksReport.DEFAULT_FILE - - path.parent.mkdir(parents=True, exist_ok=True) - path_suffix = path.suffix.lower()[1:] - - with path.open("r") as file: - if (type_ or path_suffix) == "json": - model_dict = json.loads(file.read()) - elif (type_ or path_suffix) in ["yaml", "yml"]: - model_dict = yaml.safe_load(file) - else: - raise ValueError(f"Unsupported file type: {type_} for {path}.") - - return GenerativeBenchmarksReport.model_validate(model_dict) - - args: BenchmarkGenerativeTextArgs = Field( - description="The benchmark arguments used for all benchmarks in the report." - ) - benchmarks: list[GenerativeBenchmark] = Field( - description="The list of completed benchmarks contained within the report.", - default_factory=list, - ) - - def save_file( - self, path: str | Path | None, type_: Literal["json", "yaml"] | None = None - ) -> Path: - """ - Save the report to a file. - - :param path: The path to save the report to. - :param type_: File type override, auto-detected from extension if None. - :return: The path to the saved report. - :raises ValueError: If file type is unsupported. - """ - if path is None: - path = Path.cwd() - elif not isinstance(path, Path): - path = Path(path) - - if path.is_dir(): - path = path / GenerativeBenchmarksReport.DEFAULT_FILE - - path.parent.mkdir(parents=True, exist_ok=True) - path_suffix = path.suffix.lower()[1:] - model_dict = self.model_dump() - - if (type_ or path_suffix) == "json": - save_str = json.dumps(model_dict) - elif (type_ or path_suffix) in ["yaml", "yml"]: - save_str = yaml.dump(model_dict) - else: - raise ValueError(f"Unsupported file type: {type_} for {path}.") - - with path.open("w") as file: - file.write(save_str) - - return path diff --git a/src/guidellm/benchmark/schemas/__init__.py b/src/guidellm/benchmark/schemas/__init__.py new file mode 100644 index 00000000..fd0f5016 --- /dev/null +++ b/src/guidellm/benchmark/schemas/__init__.py @@ -0,0 +1,64 @@ +""" +Benchmark schemas for performance measurement and result analysis. + +This module consolidates the complete benchmark schema ecosystem, providing both +base abstractions for benchmark execution and domain-specific implementations +for generative AI tasks. It exports core configuration objects, accumulator +interfaces for real-time metric collection, benchmark result containers with +statistical summaries, and reporting utilities. The schemas support flexible +scheduling strategies, comprehensive metric tracking including latency and +throughput distributions, and multi-modal generative benchmarks for text, image, +video, and audio generation tasks. 
+""" + +from __future__ import annotations + +from .base import ( + Benchmark, + BenchmarkAccumulator, + BenchmarkAccumulatorT, + BenchmarkConfig, + BenchmarkT, +) +from .generative import ( + BenchmarkGenerativeTextArgs, + GenerativeAudioMetricsSummary, + GenerativeBenchmark, + GenerativeBenchmarkAccumulator, + GenerativeBenchmarksReport, + GenerativeBenchmarkTimings, + GenerativeImageMetricsSummary, + GenerativeMetrics, + GenerativeMetricsAccumulator, + GenerativeMetricsSummary, + GenerativeRequestsAccumulator, + GenerativeTextMetricsSummary, + GenerativeVideoMetricsSummary, + RunningMetricStats, + SchedulerMetrics, + SchedulerMetricsAccumulator, +) + +__all__ = [ + "Benchmark", + "BenchmarkAccumulator", + "BenchmarkAccumulatorT", + "BenchmarkConfig", + "BenchmarkGenerativeTextArgs", + "BenchmarkT", + "GenerativeAudioMetricsSummary", + "GenerativeBenchmark", + "GenerativeBenchmarkAccumulator", + "GenerativeBenchmarkTimings", + "GenerativeBenchmarksReport", + "GenerativeImageMetricsSummary", + "GenerativeMetrics", + "GenerativeMetricsAccumulator", + "GenerativeMetricsSummary", + "GenerativeRequestsAccumulator", + "GenerativeTextMetricsSummary", + "GenerativeVideoMetricsSummary", + "RunningMetricStats", + "SchedulerMetrics", + "SchedulerMetricsAccumulator", +] diff --git a/src/guidellm/benchmark/schemas/base.py b/src/guidellm/benchmark/schemas/base.py new file mode 100644 index 00000000..91e2fa95 --- /dev/null +++ b/src/guidellm/benchmark/schemas/base.py @@ -0,0 +1,190 @@ +""" +Core benchmark schemas for performance measurement and result analysis. + +Provides base classes and configuration for benchmark execution, including +accumulation of metrics during scheduler runs and compilation of final results. +Supports configurable scheduling strategies with comprehensive metric collection +for latency, throughput, and concurrency analysis. +""" + +from __future__ import annotations + +import uuid +from abc import ABC, abstractmethod +from typing import Any, Generic, TypeVar + +from pydantic import Field + +from guidellm.benchmark.profile import Profile +from guidellm.scheduler import ( + MultiTurnRequestT, + RequestT, + ResponseT, + SchedulerState, + SchedulingStrategy, +) +from guidellm.schemas import RequestInfo, StandardBaseDict, StatusDistributionSummary + +__all__ = [ + "Benchmark", + "BenchmarkAccumulator", + "BenchmarkAccumulatorT", + "BenchmarkConfig", + "BenchmarkT", +] + +BenchmarkAccumulatorT = TypeVar( + "BenchmarkAccumulatorT", bound="BenchmarkAccumulator[Any, Any]" +) + +BenchmarkT = TypeVar("BenchmarkT", bound="Benchmark") + + +class BenchmarkConfig(StandardBaseDict): + """ + Configuration parameters for benchmark execution. + + Encapsulates scheduler strategy, request sampling, warmup/cooldown phases, + and metric collection preferences for controlled benchmark runs. 
+ """ + + id_: str = Field( + default_factory=lambda: str(uuid.uuid4()), + description="Unique identifier for this benchmark execution", + ) + run_id: str = Field( + description="Unique identifier for the benchmark run", + ) + run_index: int = Field( + description="Sequential index of this run within a benchmark series", + ) + strategy: SchedulingStrategy = Field( + description="Scheduling strategy for request execution", + ) + constraints: dict[str, dict[str, Any]] = Field( + description="Constraints applied to the scheduling strategy", + ) + sample_requests: int | None = Field( + default=20, + description="Number of requests to sample for final benchmark metrics", + ) + warmup: int | float | None = Field( + default=None, + description="Warmup period in seconds before benchmarking starts", + ) + cooldown: int | float | None = Field( + default=None, + description="Cooldown period in seconds after benchmarking ends", + ) + prefer_response_metrics: bool = Field( + default=True, + description="Whether to prioritize response metrics over request metrics", + ) + profile: Profile = Field( + description="Benchmark profile defining execution parameters", + ) + requests: dict[str, Any] = Field( + description="Request configuration and dataset information", + ) + backend: dict[str, Any] = Field( + description="Backend configuration and connection details", + ) + environment: dict[str, Any] = Field( + description="Execution environment configuration and metadata", + ) + + +class BenchmarkAccumulator(StandardBaseDict, ABC, Generic[RequestT, ResponseT]): + """ + Accumulates metrics and state during benchmark execution. + + Tracks benchmark progress by updating estimates as requests are processed, + enabling incremental metric collection during scheduler runs. + """ + + config: BenchmarkConfig = Field( + description="Configuration parameters for this benchmark execution", + ) + + @abstractmethod + def update_estimate( + self, + response: ResponseT | None, + request: RequestT | MultiTurnRequestT[RequestT], + info: RequestInfo, + scheduler_state: SchedulerState, + ): + """ + Update benchmark estimates with new request/response data. + + :param response: Response from the backend, if available + :param request: Request submitted to the backend + :param info: Metadata about request execution timing and status + :param scheduler_state: Current state of the scheduler + """ + ... + + +class Benchmark(StandardBaseDict, ABC, Generic[BenchmarkAccumulatorT]): + """ + Abstract base class for benchmark result implementations. + + Defines the interface for capturing execution metrics and compiling final results + from scheduler-driven workload executions, including request latency, throughput, + and concurrency distributions. 
+ """ + + @property + @abstractmethod + def start_time(self) -> float: + """ + :return: Benchmark start time in seconds since epoch + """ + + @property + @abstractmethod + def end_time(self) -> float: + """ + :return: Benchmark end time in seconds since epoch + """ + + @property + @abstractmethod + def duration(self) -> float: + """ + :return: Total benchmark execution duration in seconds + """ + + @property + @abstractmethod + def request_latency(self) -> StatusDistributionSummary: + """ + :return: Distribution of request latencies across all processed requests + """ + + @property + @abstractmethod + def request_throughput(self) -> StatusDistributionSummary: + """ + :return: Distribution of request throughput across benchmark duration + """ + + @property + @abstractmethod + def request_concurrency(self) -> StatusDistributionSummary: + """ + :return: Distribution of concurrent requests across benchmark duration + """ + + @classmethod + @abstractmethod + def compile( + cls, accumulator: BenchmarkAccumulatorT, scheduler_state: SchedulerState + ) -> Any: + """ + Compile final benchmark results from accumulated metrics. + + :param accumulator: Accumulated benchmark state with request statistics + :param scheduler_state: Final state of the scheduler after execution + :return: Compiled benchmark instance with complete results + """ diff --git a/src/guidellm/benchmark/schemas/generative/__init__.py b/src/guidellm/benchmark/schemas/generative/__init__.py new file mode 100644 index 00000000..ad70fde0 --- /dev/null +++ b/src/guidellm/benchmark/schemas/generative/__init__.py @@ -0,0 +1,54 @@ +""" +Generative AI benchmark schemas for performance measurement and analysis. + +This module provides the complete schema ecosystem for executing, tracking, and +analyzing generative AI benchmarks. It encompasses configuration entrypoints for +benchmark setup, real-time metric accumulators for execution monitoring, +comprehensive result containers with statistical summaries, and multi-benchmark +reporting capabilities. The schemas support domain-specific metrics for text, +image, video, and audio generation tasks, enabling detailed performance analysis +including throughput, latency distributions, concurrency patterns, and scheduler +behavior tracking across successful, incomplete, and errored requests. 
+""" + +from __future__ import annotations + +from .accumulator import ( + GenerativeBenchmarkAccumulator, + GenerativeBenchmarkTimings, + GenerativeMetricsAccumulator, + GenerativeRequestsAccumulator, + RunningMetricStats, + SchedulerMetricsAccumulator, +) +from .benchmark import GenerativeBenchmark +from .entrypoints import BenchmarkGenerativeTextArgs +from .metrics import ( + GenerativeAudioMetricsSummary, + GenerativeImageMetricsSummary, + GenerativeMetrics, + GenerativeMetricsSummary, + GenerativeTextMetricsSummary, + GenerativeVideoMetricsSummary, + SchedulerMetrics, +) +from .report import GenerativeBenchmarksReport + +__all__ = [ + "BenchmarkGenerativeTextArgs", + "GenerativeAudioMetricsSummary", + "GenerativeBenchmark", + "GenerativeBenchmarkAccumulator", + "GenerativeBenchmarkTimings", + "GenerativeBenchmarksReport", + "GenerativeImageMetricsSummary", + "GenerativeMetrics", + "GenerativeMetricsAccumulator", + "GenerativeMetricsSummary", + "GenerativeRequestsAccumulator", + "GenerativeTextMetricsSummary", + "GenerativeVideoMetricsSummary", + "RunningMetricStats", + "SchedulerMetrics", + "SchedulerMetricsAccumulator", +] diff --git a/src/guidellm/benchmark/schemas/generative/accumulator.py b/src/guidellm/benchmark/schemas/generative/accumulator.py new file mode 100644 index 00000000..20ef08c0 --- /dev/null +++ b/src/guidellm/benchmark/schemas/generative/accumulator.py @@ -0,0 +1,847 @@ +""" +Real-time metric accumulation for generative benchmark execution. + +Captures and computes performance metrics during benchmark runs, tracking timing phases, +request statistics, token throughput, and latency distributions. Components include +timing trackers for warmup/cooldown phases, running statistical accumulators for +throughput and latency metrics, and reservoir sampling for request data. Enables +comprehensive performance measurement including scheduler overhead, time-to-first-token, +inter-token latency, and token generation rates across completed, errored, and +incomplete requests. +""" + +from __future__ import annotations + +import random +import time +from typing import Literal + +from pydantic import Field + +from guidellm.benchmark.schemas.base import BenchmarkAccumulator, BenchmarkConfig +from guidellm.scheduler import MultiTurnRequestT, SchedulerState +from guidellm.schemas import ( + GenerationRequest, + GenerationResponse, + GenerativeRequestStats, + RequestInfo, + RequestTimings, + StandardBaseModel, + StatusBreakdown, +) + +__all__ = [ + "GenerativeBenchmarkAccumulator", + "GenerativeBenchmarkTimings", + "GenerativeMetricsAccumulator", + "GenerativeRequestsAccumulator", + "RunningMetricStats", + "SchedulerMetricsAccumulator", +] + + +class GenerativeBenchmarkTimings(StandardBaseModel): + """ + Tracks timing phases and transitions during benchmark execution. + + Monitors timestamps throughout benchmark execution including request submission, + measurement period boundaries (warmup/active/cooldown), and completion events. + Provides duration calculations and phase status determination based on configured + warmup and cooldown periods. 
+ """ + + request_start: float | None = Field( + description="Timestamp when the first request was sent", default=None + ) + measure_start: float | None = Field( + description="Timestamp when measurement period started", default=None + ) + measure_end: float | None = Field( + description="Timestamp when measurement period ended", default=None + ) + request_end: float | None = Field( + description="Timestamp when the last request was completed", default=None + ) + current_update: float | None = Field( + description="Most recent timestamp observed during execution", default=None + ) + current_request: float | None = Field( + description="Most recent request completion timestamp observed", default=None + ) + last_update: float | None = Field( + description="Previous timestamp observed before the current one", default=None + ) + last_request: float | None = Field( + description="Previous request completion timestamp before the current one", + default=None, + ) + + @property + def status(self) -> Literal["pending", "warmup", "active", "cooldown"]: + """ + :return: Current execution phase based on timing thresholds + """ + if self.request_start is None: + return "pending" + + if self.measure_start is None: + return "warmup" + + if self.measure_end is None: + return "active" + + return "cooldown" + + @property + def duration(self) -> float: + """ + :return: Elapsed time since measurement or request start in seconds + """ + if self.current_update is None: + return 0.0 + + start_time = self.measure_start or self.request_start + + return (self.current_update - start_time) if start_time is not None else 0.0 + + @property + def elapsed_time_last_update(self) -> float: + """ + :return: Time elapsed between the last two update timestamps in seconds + """ + if self.current_update is None or self.last_update is None: + return 0.0 + + return self.current_update - self.last_update + + @property + def elapsed_time_last_request(self) -> float: + """ + :return: Time elapsed between the last two request completions in seconds + """ + if self.current_request is None or self.last_request is None: + return 0.0 + + return self.current_request - self.last_request + + def update_estimate( + self, + info: RequestInfo, + scheduler_state: SchedulerState, + config: BenchmarkConfig, + ): + """ + Update timing estimates based on request info and scheduler state. + + Advances timing markers through benchmark phases (warmup to active to cooldown) + based on configured thresholds. Updates current/last timestamps for updates and + request completions, determining measurement period boundaries. 
+ + :param info: Request information containing timing data + :param scheduler_state: Current scheduler state with progress metrics + :param config: Benchmark configuration with warmup/cooldown settings + """ + request_start = info.timings.request_start or info.timings.resolve_start + request_end = info.timings.request_end or info.timings.resolve_end + current_time = info.timings.last_reported + + self.request_start = self.request_start or request_start + + if request_end is not None and ( + self.request_end is None or request_end > self.request_end + ): + # Always update request end to the max seen so far + self.request_end = request_end + + # Update last and current update times + self.last_update = self.current_update + if current_time is not None and ( + self.current_update is None or current_time > self.current_update + ): + self.current_update = current_time + + # Update last and current request times, if applicable + if info.status in {"completed", "errored", "cancelled"}: + self.last_request = self.current_request + if request_end is not None and ( + self.current_request is None or request_end > self.current_request + ): + self.current_request = request_end + + # Update measurement start time based on warmup configuration + if config.warmup is not None and self.measure_start is None: + exceeded_time = ( + config.warmup >= 1.0 + and scheduler_state.remaining_duration is not None + and self.duration is not None + and self.duration >= config.warmup + ) + exceeded_count = ( + config.warmup >= 1.0 + and scheduler_state.remaining_requests is not None + and scheduler_state.processed_requests >= config.warmup + ) + exceeded_fraction = ( + config.warmup < 1.0 + and scheduler_state.remaining_fraction is not None + and 1.0 - scheduler_state.remaining_fraction >= config.warmup + ) + + if exceeded_time or exceeded_count or exceeded_fraction: + self.measure_start = self.current_update + elif config.warmup is None and self.measure_start is None: + # No warmup configured, start measuring at first request + self.measure_start = self.request_start + + # Update measurement end time based on cooldown configuration + if config.cooldown is not None and self.measure_end is None: + exceeded_time = ( + config.cooldown >= 1.0 + and scheduler_state.remaining_duration is not None + and scheduler_state.remaining_duration <= config.cooldown + ) + exceeded_count = ( + config.cooldown >= 1.0 + and scheduler_state.remaining_requests is not None + and scheduler_state.remaining_requests <= config.cooldown + ) + exceeded_fraction = ( + config.cooldown < 1.0 + and scheduler_state.remaining_fraction is not None + and scheduler_state.remaining_fraction <= config.cooldown + ) + + if exceeded_time or exceeded_count or exceeded_fraction: + self.measure_end = self.current_update + + +class RunningMetricStats(StandardBaseModel): + """ + Maintains running statistics for a metric stream without storing all samples. + + Accumulates count, sum, time-weighted sum, and duration to compute mean, rate, + and time-weighted statistics incrementally. Efficient for real-time metric tracking + during long-running benchmarks where storing individual samples is impractical. 
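+
+    Example (illustrative; the values follow directly from the update rules below)::
+
+        stats = RunningMetricStats()
+        stats.update_estimate(10.0, elapsed=1.0)
+        stats.update_estimate(20.0, elapsed=1.0)
+        stats.mean                # 15.0 (value_sum 30.0 / 2 samples)
+        stats.time_weighted_mean  # 5.0 (previous value 10.0 held for 1s of 2s total)
+        stats.rate_per_second     # 15.0 (value_sum 30.0 / 2.0s duration)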
+ """ + + count: int = Field(description="Number of samples accumulated", default=0) + value_sum: float = Field(description="Total sum of accumulated values", default=0.0) + time_weighted_sum: float = Field( + description="Time-weighted sum of accumulated values", default=0.0 + ) + duration: float = Field( + description="Total duration over which values were accumulated", default=0.0 + ) + last_value: float | None = Field( + description="Most recent value added to the accumulator", default=None + ) + + @property + def mean(self) -> float | None: + """ + :return: Arithmetic mean of accumulated values, or None if no samples + """ + if self.count <= 0: + return None + + return self.value_sum / self.count + + @property + def time_weighted_mean(self) -> float | None: + """ + :return: Time-weighted mean considering duration between samples, or None + """ + if self.duration <= 0.0: + return None + + return self.time_weighted_sum / self.duration + + @property + def rate_per_item(self) -> float | None: + """ + :return: Average value per accumulated item, or None if no samples + """ + if self.count <= 0: + return None + + return self.value_sum / self.count + + @property + def rate_per_second(self) -> float | None: + """ + :return: Average value per second of duration, or None if no duration + """ + if self.duration <= 0.0: + return None + + return self.value_sum / self.duration + + def update_estimate( + self, + value: float | None, + count: int = 1, + duration: float | None = None, + elapsed: float | None = None, + ): + """ + Incorporate a new metric value into running statistics. + + Updates count, sum, and time-weighted statistics using the new value and timing + information. Time-weighted calculations use the previous value over the elapsed + interval to capture sustained metric behavior. + + :param value: New metric value to accumulate + :param count: Number of occurrences this value represents + :param duration: Total duration to set, overriding incremental elapsed updates + :param elapsed: Time elapsed since last update for time-weighted calculations + """ + self.count += count + self.value_sum += (value or 0.0) * count + + if elapsed is not None: + self.time_weighted_sum += (self.last_value or 0.0) * elapsed + + self.duration = ( + duration if duration is not None else (self.duration + (elapsed or 0.0)) + ) + self.last_value = value + + +class SchedulerMetricsAccumulator(StandardBaseModel): + """ + Tracks scheduler-level timing and overhead metrics during execution. + + Monitors request lifecycle timing from queuing through completion, capturing delays + at each stage: queue time, worker start delays, request processing time, and + finalization overhead. Provides insight into scheduler efficiency and bottleneck + identification in request orchestration. 
+ """ + + requests_made: StatusBreakdown[int, int, int, int] = Field( + description="Request counts by status: successful, incomplete, errored, total", + default_factory=lambda: StatusBreakdown[int, int, int, int]( + successful=0, errored=0, incomplete=0, total=0 + ), + ) + # Timings flow: + # Request scheduling: queued->dequeued->scheduled_at->resolve_start-> + # Request processing: request_start->*_iteration->request_end-> + # Request finalizing: resolve_end->finalized->accumulation update processed + queued_time: RunningMetricStats = Field( + default_factory=RunningMetricStats, + description="Running stats for time requests spent in the queue", + ) + resolve_start_delay: RunningMetricStats = Field( + default_factory=RunningMetricStats, + description=( + "Running stats for delay before worker begins resolving req after dequeue" + ), + ) + resolve_targeted_start_delay: RunningMetricStats = Field( + default_factory=RunningMetricStats, + description=( + "Running stats for delay from targeted start to actual worker start" + ), + ) + request_start_delay: RunningMetricStats = Field( + default_factory=RunningMetricStats, + description="Running stats for delay after resolve til request start", + ) + request_targeted_start_delay: RunningMetricStats = Field( + default_factory=RunningMetricStats, + description=( + "Running stats for delay from targeted start to actual request start" + ), + ) + request_time: RunningMetricStats = Field( + default_factory=RunningMetricStats, + description="Running stats for request processing time", + ) + resolve_end_delay: RunningMetricStats = Field( + default_factory=RunningMetricStats, + description="Running stats for delay after request end till worker resolves", + ) + resolve_time: RunningMetricStats = Field( + default_factory=RunningMetricStats, + description="Running stats for time for worker to resolve requests", + ) + finalized_delay: RunningMetricStats = Field( + default_factory=RunningMetricStats, + description="Running stats for delay after resolve til finalized in scheduler", + ) + processed_delay: RunningMetricStats = Field( + default_factory=RunningMetricStats, + description=( + "Running stats for delay from finalized til request being " + "processed by accumulation" + ), + ) + + def update_estimate( + self, scheduler_state: SchedulerState, stats: GenerativeRequestStats + ): + """ + Update scheduler metrics with completed request timing data. + + Extracts timing information from request statistics to update running metrics + for each scheduler lifecycle stage. Validates that required timing markers are + present before processing. 
+ + :param scheduler_state: Current scheduler state with request counts + :param stats: Completed request statistics with detailed timing information + :raises ValueError: If required timing markers are missing + """ + # Update request counts + self.requests_made.successful = scheduler_state.successful_requests + self.requests_made.errored = scheduler_state.errored_requests + self.requests_made.incomplete = scheduler_state.cancelled_requests + self.requests_made.total = ( + scheduler_state.successful_requests + + scheduler_state.errored_requests + + scheduler_state.cancelled_requests + ) + + # All requests must have queued, dequeued, resolve_end, and finalized timings + timings: RequestTimings = stats.info.timings + if any( + timing is None + for timing in [ + timings.queued, + timings.dequeued, + timings.resolve_end, + timings.finalized, + ] + ): + raise ValueError( + "Required timings 'queued', 'dequeued', 'resolve_end', and " + "'finalized' must not be None" + ) + + # Store validated non-None timings for type safety + queued: float = timings.queued # type: ignore[assignment] + dequeued: float = timings.dequeued # type: ignore[assignment] + resolve_end: float = timings.resolve_end # type: ignore[assignment] + finalized: float = timings.finalized # type: ignore[assignment] + + # Update timing metrics in occurrence order + self.queued_time.update_estimate(value=dequeued - queued) + + if timings.scheduled_at is not None and timings.resolve_start is not None: + self.resolve_start_delay.update_estimate( + value=timings.resolve_start - timings.scheduled_at + ) + + if timings.targeted_start is not None and timings.resolve_start is not None: + self.resolve_targeted_start_delay.update_estimate( + value=timings.resolve_start - timings.targeted_start + ) + + if timings.resolve_start is not None and timings.request_start is not None: + self.request_start_delay.update_estimate( + value=timings.request_start - timings.resolve_start + ) + + if timings.targeted_start is not None and timings.request_start is not None: + self.request_targeted_start_delay.update_estimate( + value=timings.request_start - timings.targeted_start + ) + + if timings.request_start is not None and timings.request_end is not None: + self.request_time.update_estimate( + value=timings.request_end - timings.request_start + ) + + if timings.request_end is not None: + self.resolve_end_delay.update_estimate( + value=resolve_end - timings.request_end + ) + + if timings.resolve_start is not None: + self.resolve_time.update_estimate(value=resolve_end - timings.resolve_start) + + self.finalized_delay.update_estimate(value=finalized - resolve_end) + self.processed_delay.update_estimate(value=time.time() - finalized) + + +class GenerativeMetricsAccumulator(StandardBaseModel): + """ + Accumulates generative model performance metrics during execution. + + Tracks token throughput, latency characteristics, and request timing for generative + workloads. Maintains running statistics for input/output tokens, + time-to-first-token, inter-token latency, and streaming patterns for comprehensive + performance analysis. 
+ """ + + requests: RunningMetricStats = Field( + default_factory=RunningMetricStats, + description="Accumulated request count statistics", + ) + request_latency: RunningMetricStats = Field( + default_factory=RunningMetricStats, + description="Accumulated request latency statistics", + ) + input_tokens: RunningMetricStats = Field( + default_factory=RunningMetricStats, + description="Accumulated input token count statistics", + ) + output_tokens: RunningMetricStats = Field( + default_factory=RunningMetricStats, + description="Accumulated output token count statistics", + ) + total_tokens: RunningMetricStats = Field( + default_factory=RunningMetricStats, + description="Accumulated total token count statistics", + ) + time_to_first_token_ms: RunningMetricStats = Field( + default_factory=RunningMetricStats, + description="Accumulated time to first token statistics in milliseconds", + ) + time_per_output_token_ms: RunningMetricStats = Field( + default_factory=RunningMetricStats, + description="Accumulated time per output token statistics in milliseconds", + ) + inter_token_latency_ms: RunningMetricStats = Field( + default_factory=RunningMetricStats, + description="Accumulated inter-token latency statistics in milliseconds", + ) + streaming_iterations: RunningMetricStats = Field( + default_factory=RunningMetricStats, + description="Accumulated streaming iteration count statistics", + ) + output_tokens_by_iteration: RunningMetricStats = Field( + default_factory=RunningMetricStats, + description="Accumulated output tokens per iteration statistics", + ) + iter_tokens_by_iteration: RunningMetricStats = Field( + default_factory=RunningMetricStats, + description="Accumulated iteration tokens per iteration statistics", + ) + + def update_estimate(self, stats: GenerativeRequestStats, duration: float): + """ + Update generative metrics with completed request statistics. + + Incorporates token counts, latency measurements, and streaming characteristics + from a completed request into running metric accumulators with time-weighted + calculations. + + :param stats: Request statistics containing token and latency measurements + :param duration: Current benchmark duration for time-weighted metrics + """ + self.requests.update_estimate(1.0, duration=duration) + self.input_tokens.update_estimate(stats.input_tokens, duration=duration) + self.output_tokens.update_estimate(stats.output_tokens, duration=duration) + self.total_tokens.update_estimate(stats.total_tokens, duration=duration) + self.request_latency.update_estimate(stats.request_latency, duration=duration) + self.time_to_first_token_ms.update_estimate( + stats.time_to_first_token_ms, duration=duration + ) + self.time_per_output_token_ms.update_estimate( + stats.time_per_output_token_ms, + count=int(stats.output_tokens or 0), + duration=duration, + ) + self.inter_token_latency_ms.update_estimate( + stats.inter_token_latency_ms, + count=int((stats.output_tokens or 1) - 1), + duration=duration, + ) + self.streaming_iterations.update_estimate( + stats.token_iterations, duration=duration + ) + self.output_tokens_by_iteration.update_estimate( + stats.output_tokens_per_iteration, + count=int(stats.token_iterations or 0), + duration=duration, + ) + self.iter_tokens_by_iteration.update_estimate( + stats.iter_tokens_per_iteration, + count=int((stats.token_iterations or 1) - 1), + duration=duration, + ) + + +class GenerativeRequestsAccumulator(StandardBaseModel): + """ + Manages request statistics collection with optional reservoir sampling. 
+ + Collects detailed request statistics while optionally sampling to limit memory usage + in long-running benchmarks. Supports configurable sampling rates and selective data + retention (clearing request arguments and/or outputs for non-sampled requests). + """ + + sample_requests: int | None = Field( + default=None, + description=( + "Number of requests to sample and keep in the final benchmark for metrics" + ), + ) + requests_stats: list[GenerativeRequestStats] = Field( + description="List of generative request statistics", default_factory=list + ) + samples: list[int] | None = Field( + description="Indices of sampled generative requests", default=None + ) + clear_nonsampled_request_args: bool = Field( + default=True, + description=( + "Whether to clear request arguments and outputs for non-sampled requests" + ), + ) + clear_nonsampled_outputs: bool = Field( + default=True, + description=( + "Whether to clear outputs for non-sampled requests while keeping args" + ), + ) + + def get_sampled(self) -> list[GenerativeRequestStats]: + """ + Retrieve the list of sampled request statistics. + + :return: List of sampled generative request statistics + """ + if self.samples is None: + return self.requests_stats + + return [self.requests_stats[ind] for ind in self.samples] + + def get_within_range( + self, start_time: float, end_time: float + ) -> list[GenerativeRequestStats]: + """ + Retrieve request statistics within a specified time range. + + :param start_time: Start timestamp for filtering (requests must end after this) + :param end_time: End timestamp for filtering (requests must start before this) + :return: List of request statistics within the time range + """ + return [ + stats + for stats in self.requests_stats + if (stats.request_end_time >= start_time) + and ( + ( + stats.request_start_time is not None + and stats.request_start_time <= end_time + ) + or ( + stats.request_start_time is None + and stats.request_end_time <= end_time + ) + ) + ] + + def update_estimate( + self, + response: GenerationResponse | None, + request: GenerationRequest | MultiTurnRequestT[GenerationRequest], + info: RequestInfo, + prefer_response_metrics: bool, + ) -> GenerativeRequestStats: + """ + Record request statistics and apply reservoir sampling if configured. + + Compiles statistics from the completed request and adds to the collection. + Uses reservoir sampling algorithm to maintain uniform sample distribution when + enabled, clearing non-sampled request data to manage memory. 
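+        Each recorded request is retained in the sample with equal probability
+        sample_requests / n, where n is the number of requests seen so far.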
+ + :param response: Generation response containing output and metrics + :param request: Original generation request with input data + :param info: Request execution information and timing + :param prefer_response_metrics: Whether to prefer metrics from response + :return: Compiled request statistics + """ + stats = self.compile_stats(response, request, info, prefer_response_metrics) + + current_index = len(self.requests_stats) + self.requests_stats.append(stats) + + if self.sample_requests is None: + # Keeping all requests, don't need to sample + self.samples = None + elif self.sample_requests <= 0: + # Not keeping any requests, clear out unnecessary memory usage for current + self.clear_stats_data(stats) + elif self.sample_requests >= len(self.requests_stats): + # Add directly to samples, haven't filled yet + if self.samples is None: + self.samples = [] + self.samples.append(current_index) + elif self.sample_requests / len(self.requests_stats) >= random.random(): + # Sampling logic: choose to replace with decreasing probability s / n + # where s is sample size, n is current number of requests. + # If chosen, choose random existing sample to replace. + # P(new item in samples) = s / n + # P(prev item in samples) = P(item was in samples) * P(not replaced) + # P(prev item in samples) = + # P(before replacement) * P(new item selected) * P(chosen from samples) + # P(prev item in samples) = (s / (n - 1)) * (s / n) * (1 / s) = s / n + # P(prev item in samples) = P(new item in samples) + if self.samples is None: + self.samples = [] + replace_index = random.randrange(len(self.samples)) + self.clear_stats_data(self.samples[replace_index]) + self.samples[replace_index] = current_index + + return stats + + def clear_stats_data(self, stats: GenerativeRequestStats | int): + if isinstance(stats, int): + stats = self.requests_stats[stats] + + if self.clear_nonsampled_request_args: + stats.request_args = None + if self.clear_nonsampled_outputs: + stats.output = None + + @classmethod + def compile_stats( + cls, + response: GenerationResponse | None, + request: GenerationRequest | MultiTurnRequestT[GenerationRequest], + info: RequestInfo, + prefer_response_metrics: bool, + ) -> GenerativeRequestStats: + """ + Compile statistics from request, response, and execution info. + + :param response: Generation response with output and metrics, or None + :param request: Original generation request with input data + :param info: Request execution information and timing + :param prefer_response_metrics: Whether to prefer metrics from response + :return: Compiled generative request statistics + """ + # Extract the first request for arguments if multi-turn + first_request: GenerationRequest + if isinstance(request, GenerationRequest): + first_request = request + else: + # Multi-turn request: extract first item + first_item = request[0] + first_request = ( + first_item[0] if isinstance(first_item, tuple) else first_item + ) + + if response is None: + response = GenerationResponse( + request_id=info.request_id, request_args=str(first_request.arguments) + ) + + return response.compile_stats( + request=first_request, + info=info, + prefer_response=prefer_response_metrics, + ) + + +class GenerativeBenchmarkAccumulator( + BenchmarkAccumulator[GenerationRequest, GenerationResponse] +): + """ + Primary accumulator for generative benchmark execution metrics and statistics. + + Orchestrates real-time metric collection across timing, scheduler, concurrency, and + generative performance dimensions. 
Maintains separate accumulators for completed, + errored, and incomplete requests while tracking overall metrics. Integrates with + scheduler state to monitor warmup/cooldown phases and compute time-weighted + statistics for throughput and latency analysis. + """ + + timings: GenerativeBenchmarkTimings = Field( + default_factory=GenerativeBenchmarkTimings, + description="Timing phases and transitions during benchmark execution", + ) + completed: GenerativeRequestsAccumulator = Field( + default_factory=GenerativeRequestsAccumulator, + description="Accumulator for completed requests", + ) + errored: GenerativeRequestsAccumulator = Field( + default_factory=GenerativeRequestsAccumulator, + description="Accumulator for errored requests", + ) + incomplete: GenerativeRequestsAccumulator = Field( + default_factory=GenerativeRequestsAccumulator, + description="Accumulator for incomplete requests", + ) + scheduler_metrics: SchedulerMetricsAccumulator = Field( + default_factory=SchedulerMetricsAccumulator, + description="Running metrics for scheduler state", + ) + concurrency_metric: RunningMetricStats = Field( + default_factory=RunningMetricStats, + description="Accumulated request concurrency statistics", + ) + total_metrics: GenerativeMetricsAccumulator = Field( + default_factory=GenerativeMetricsAccumulator, + description="Running metrics for all requests", + ) + completed_metrics: GenerativeMetricsAccumulator = Field( + default_factory=GenerativeMetricsAccumulator, + description="Running metrics for completed requests", + ) + errored_metrics: GenerativeMetricsAccumulator = Field( + default_factory=GenerativeMetricsAccumulator, + description="Running metrics for errored requests", + ) + incomplete_metrics: GenerativeMetricsAccumulator = Field( + default_factory=GenerativeMetricsAccumulator, + description="Running metrics for incomplete requests", + ) + + def update_estimate( + self, + response: GenerationResponse | None, + request: GenerationRequest | MultiTurnRequestT[GenerationRequest], + info: RequestInfo, + scheduler_state: SchedulerState, + ): + """ + Update all benchmark metrics with a completed request. + + Processes request completion by updating timing phases, concurrency metrics, + scheduler statistics, and generative performance metrics. Routes request to + appropriate status-specific accumulator (completed/errored/incomplete) and + updates aggregate totals. Cancelled requests that never started are ignored. 
+ + :param response: Generation response with output and metrics, or None + :param request: Original generation request with input data + :param info: Request execution information and timing + :param scheduler_state: Current scheduler state for phase tracking + """ + if info.status == "cancelled" and info.timings.resolve_start is None: + # Cancelled requests that never started should be ignored + return + + self.timings.update_estimate(info, scheduler_state, self.config) + + duration = self.timings.duration + elapsed_time_last_update = self.timings.elapsed_time_last_update + self.concurrency_metric.update_estimate( + value=scheduler_state.processing_requests, + duration=duration, + elapsed=elapsed_time_last_update, + ) + + requests_accumulator: GenerativeRequestsAccumulator + metrics_accumulator: GenerativeMetricsAccumulator + + if info.status == "completed": + requests_accumulator = self.completed + metrics_accumulator = self.completed_metrics + elif info.status == "errored": + requests_accumulator = self.errored + metrics_accumulator = self.errored_metrics + elif info.status == "cancelled": + requests_accumulator = self.incomplete + metrics_accumulator = self.incomplete_metrics + else: + return + + stats = requests_accumulator.update_estimate( + response, request, info, self.config.prefer_response_metrics + ) + metrics_accumulator.update_estimate(stats, duration) + self.total_metrics.update_estimate(stats, duration) + self.scheduler_metrics.update_estimate(scheduler_state, stats) diff --git a/src/guidellm/benchmark/schemas/generative/benchmark.py b/src/guidellm/benchmark/schemas/generative/benchmark.py new file mode 100644 index 00000000..1d7f83ca --- /dev/null +++ b/src/guidellm/benchmark/schemas/generative/benchmark.py @@ -0,0 +1,141 @@ +""" +Benchmark data models and metrics for generative AI performance measurement. + +Provides comprehensive data structures for capturing, storing, and analyzing +benchmark results from scheduler-driven generative AI workload executions. +Core abstractions include base benchmark interfaces, generative-specific +metrics with token/latency distributions, request-level statistics tracking, +and multi-benchmark reporting capabilities. These models enable detailed +performance analysis including throughput, latency, concurrency patterns, and +domain-specific metrics for text, image, video, and audio generation tasks. +""" + +from __future__ import annotations + +from typing import Literal + +from pydantic import Field, computed_field + +from guidellm.benchmark.schemas.base import Benchmark, BenchmarkConfig +from guidellm.benchmark.schemas.generative.accumulator import ( + GenerativeBenchmarkAccumulator, +) +from guidellm.benchmark.schemas.generative.metrics import ( + GenerativeMetrics, + SchedulerMetrics, +) +from guidellm.scheduler import SchedulerState +from guidellm.schemas import ( + GenerativeRequestStats, + StatusBreakdown, + StatusDistributionSummary, +) + +__all__ = ["GenerativeBenchmark"] + + +class GenerativeBenchmark(Benchmark[GenerativeBenchmarkAccumulator]): + """ + Complete generative AI benchmark results with specialized metrics. + + Encapsulates comprehensive performance data from scheduler-driven generative + workload executions including request-level statistics, token/latency distributions, + throughput analysis, and concurrency patterns. Provides computed fields for temporal + analysis and status-grouped request details for detailed post-execution reporting. 
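+
+    Example (illustrative sketch, assuming an accumulator and final scheduler state
+    from a completed run)::
+
+        benchmark = GenerativeBenchmark.compile(accumulator, scheduler_state)
+        benchmark.duration         # end_time - start_time of the measurement window
+        benchmark.request_latency  # StatusDistributionSummary across all requests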
+ """ + + type_: Literal["generative_benchmark"] = "generative_benchmark" # type: ignore[assignment] + + config: BenchmarkConfig = Field( + description="Configuration parameters for this benchmark execution", + ) + scheduler_state: SchedulerState = Field( + description="Final state of the scheduler after benchmark completion", + ) + scheduler_metrics: SchedulerMetrics = Field( + description="Scheduler timing and performance statistics", + ) + metrics: GenerativeMetrics = Field( + description="Performance metrics and statistical distributions", + ) + requests: StatusBreakdown[ + list[GenerativeRequestStats], + list[GenerativeRequestStats], + list[GenerativeRequestStats], + None, + ] = Field( + description=( + "Request details grouped by status: successful, incomplete, errored" + ), + ) + + @computed_field # type: ignore[prop-decorator] + @property + def start_time(self) -> float: + """ + :return: Benchmark start time in seconds since epoch + """ + return self.scheduler_metrics.measure_start_time + + @computed_field # type: ignore[prop-decorator] + @property + def end_time(self) -> float: + """ + :return: Benchmark end time in seconds since epoch + """ + return self.scheduler_metrics.measure_end_time + + @computed_field # type: ignore[prop-decorator] + @property + def duration(self) -> float: + """ + :return: Total benchmark execution duration in seconds + """ + return self.end_time - self.start_time + + @property + def request_latency(self) -> StatusDistributionSummary: + """ + :return: Statistical distribution of request latencies across all requests + """ + return self.metrics.request_latency + + @property + def request_throughput(self) -> StatusDistributionSummary: + """ + :return: Statistical distribution of throughput measured in requests per second + """ + return self.metrics.requests_per_second + + @property + def request_concurrency(self) -> StatusDistributionSummary: + """ + :return: Statistical distribution of concurrent requests throughout execution + """ + return self.metrics.request_concurrency + + @classmethod + def compile( + cls, + accumulator: GenerativeBenchmarkAccumulator, + scheduler_state: SchedulerState, + ) -> GenerativeBenchmark: + """ + Compile final benchmark results from accumulated execution state. + + :param accumulator: Accumulated benchmark state with request statistics + :param scheduler_state: Final scheduler state after execution completion + :return: Compiled generative benchmark instance with complete metrics + """ + return GenerativeBenchmark( + config=accumulator.config, + scheduler_state=scheduler_state, + scheduler_metrics=SchedulerMetrics.compile(accumulator, scheduler_state), + metrics=GenerativeMetrics.compile(accumulator), + requests=StatusBreakdown( + successful=accumulator.completed.get_sampled(), + incomplete=accumulator.incomplete.get_sampled(), + errored=accumulator.errored.get_sampled(), + total=None, + ), + ) diff --git a/src/guidellm/benchmark/schemas/generative/entrypoints.py b/src/guidellm/benchmark/schemas/generative/entrypoints.py new file mode 100644 index 00000000..c54b93b9 --- /dev/null +++ b/src/guidellm/benchmark/schemas/generative/entrypoints.py @@ -0,0 +1,361 @@ +""" +Configuration entrypoints for generative text benchmark execution. + +Defines parameter schemas and construction logic for creating benchmark runs from +scenario files or runtime arguments. Provides flexible configuration loading with +support for built-in scenarios, custom YAML/JSON files, and programmatic overrides. 
+Handles serialization of complex types including backends, processors, and profiles +for persistent storage and reproduction of benchmark configurations. +""" + +from __future__ import annotations + +import inspect +import json +from collections.abc import Callable +from pathlib import Path +from typing import Any, Literal + +import yaml +from pydantic import ( + AliasChoices, + AliasGenerator, + ConfigDict, + Field, + ValidationError, + ValidatorFunctionWrapHandler, + field_validator, + model_serializer, +) +from torch.utils.data import Sampler +from transformers import PreTrainedTokenizerBase + +from guidellm.backends import Backend, BackendType +from guidellm.benchmark.profile import Profile, ProfileType +from guidellm.benchmark.scenarios import get_builtin_scenarios +from guidellm.data import DatasetPreprocessor, RequestFormatter +from guidellm.scheduler import StrategyType +from guidellm.schemas import StandardBaseModel + +__all__ = ["BenchmarkGenerativeTextArgs"] + + +class BenchmarkGenerativeTextArgs(StandardBaseModel): + """ + Configuration arguments for generative text benchmark execution. + + Defines all parameters for benchmark setup including target endpoint, data + sources, backend configuration, processing pipeline, output formatting, and + execution constraints. Supports loading from scenario files and merging with + runtime overrides for flexible benchmark construction from multiple sources. + + Example:: + + # Load from built-in scenario with overrides + args = BenchmarkGenerativeTextArgs.create( + scenario="chat", + target="http://localhost:8000/v1", + max_requests=1000 + ) + + # Create from keyword arguments only + args = BenchmarkGenerativeTextArgs( + target="http://localhost:8000/v1", + data=["path/to/dataset.json"], + profile="fixed", + rate=10.0 + ) + """ + + @classmethod + def create( + cls, scenario: Path | str | None, **kwargs: dict[str, Any] + ) -> BenchmarkGenerativeTextArgs: + """ + Create benchmark args from scenario file and keyword arguments. + + Loads base configuration from scenario file (built-in or custom) and merges + with provided keyword arguments. Arguments explicitly set via kwargs override + scenario values, while defaulted kwargs are ignored to preserve scenario + settings. 
+ + :param scenario: Path to scenario file, built-in scenario name, or None + :param kwargs: Keyword arguments to override scenario values + :return: Configured benchmark args instance + :raises ValueError: If scenario is not found or file format is unsupported + """ + constructor_kwargs = {} + + if scenario is not None: + if isinstance(scenario, str) and scenario in ( + builtin_scenarios := get_builtin_scenarios() + ): + scenario_path = builtin_scenarios[scenario] + elif Path(scenario).exists() and Path(scenario).is_file(): + scenario_path = Path(scenario) + else: + raise ValueError(f"Scenario '{scenario}' not found.") + + with scenario_path.open() as file: + if scenario_path.suffix == ".json": + scenario_data = json.load(file) + elif scenario_path.suffix in {".yaml", ".yml"}: + scenario_data = yaml.safe_load(file) + else: + raise ValueError( + f"Unsupported scenario file format: {scenario_path.suffix}" + ) + if "args" in scenario_data: + # loading from a report file + scenario_data = scenario_data["args"] + constructor_kwargs.update(scenario_data) + + # Apply overrides from kwargs + constructor_kwargs.update(kwargs) + + return cls.model_validate(constructor_kwargs) + + @classmethod + def get_default(cls: type[BenchmarkGenerativeTextArgs], field: str) -> Any: + """ + Retrieve default value for a model field. + + Extracts the default value from field metadata, handling both static defaults + and factory functions. + + :param field: Field name to retrieve default value for + :return: Default value for the field + :raises ValueError: If field does not exist + """ + if field not in cls.model_fields: + raise ValueError(f"Field '{field}' not found in {cls.__name__}") + + field_info = cls.model_fields[field] + factory = field_info.default_factory + + if factory is None: + return field_info.default + + if len(inspect.signature(factory).parameters) == 0: + return factory() # type: ignore[call-arg] + else: + return factory({}) # type: ignore[call-arg] + + model_config = ConfigDict( + extra="ignore", + use_enum_values=True, + from_attributes=True, + arbitrary_types_allowed=True, + validate_by_alias=True, + validate_by_name=True, + alias_generator=AliasGenerator( + # Support field names with hyphens + validation_alias=lambda field_name: AliasChoices( + field_name, field_name.replace("_", "-") + ), + ), + ) + + # Required + target: str = Field(description="Target endpoint URL for benchmark execution") + data: list[Any] = Field( + description="List of dataset sources or data files", + default_factory=list, + min_length=1, + ) + # Benchmark configuration + profile: StrategyType | ProfileType | Profile = Field( + default="sweep", description="Benchmark profile or scheduling strategy type" + ) + rate: list[float] | None = Field( + default=None, description="Request rate(s) for rate-based scheduling" + ) + # Backend configuration + backend: BackendType | Backend = Field( + default="openai_http", description="Backend type or instance for execution" + ) + backend_kwargs: dict[str, Any] | None = Field( + default=None, description="Additional backend configuration arguments" + ) + model: str | None = Field(default=None, description="Model identifier for backend") + # Data configuration + processor: str | Path | PreTrainedTokenizerBase | None = Field( + default=None, description="Tokenizer path, name, or instance for processing" + ) + processor_args: dict[str, Any] | None = Field( + default=None, description="Additional tokenizer configuration arguments" + ) + data_args: list[dict[str, Any]] | None = Field( + 
default_factory=list, # type: ignore[arg-type] + description="Per-dataset configuration arguments", + ) + data_samples: int = Field( + default=-1, description="Number of samples to use from datasets (-1 for all)" + ) + data_column_mapper: ( + DatasetPreprocessor + | dict[str, str | list[str]] + | Literal["generative_column_mapper"] + ) = Field( + default="generative_column_mapper", + description="Column mapping preprocessor for dataset fields", + ) + data_request_formatter: RequestFormatter | dict[str, str] | str = Field( + default="chat_completions", + description="Request formatting preprocessor or template name", + validation_alias=AliasChoices( + "data_request_formatter", + "data-request-formatter", + "request_type", + "request-type", + ), + ) + data_collator: Callable | Literal["generative"] | None = Field( + default="generative", description="Data collator for batch processing" + ) + data_sampler: Sampler[int] | Literal["shuffle"] | None = Field( + default=None, description="Data sampler for request ordering" + ) + data_num_workers: int | None = Field( + default=None, description="Number of workers for data loading" + ) + dataloader_kwargs: dict[str, Any] | None = Field( + default=None, description="Additional dataloader configuration arguments" + ) + random_seed: int = Field(default=42, description="Random seed for reproducibility") + # Output configuration + output_path: str | Path | None = Field( + default_factory=Path.cwd, description="Directory path for output files" + ) + output_formats: list[str] | dict[str, str | dict[str, Any]] | None = Field( + default_factory=lambda: ["console", "json", "csv"], + description="Output format names or configuration mappings", + ) + # Benchmarker configuration + sample_requests: int | None = Field( + default=10, + description="Number of requests to sample for detailed metrics (None for all)", + ) + warmup: float | None = Field( + default=None, + description="Warmup period in seconds, requests, or fraction (0-1)", + ) + cooldown: float | None = Field( + default=None, + description="Cooldown period in seconds, requests, or fraction (0-1)", + ) + prefer_response_metrics: bool = Field( + default=True, + description="Whether to prefer backend response metrics over request metrics", + ) + # Constraints configuration + max_seconds: int | float | None = Field( + default=None, description="Maximum benchmark execution time in seconds" + ) + max_requests: int | None = Field( + default=None, description="Maximum number of requests to execute" + ) + max_errors: int | None = Field( + default=None, description="Maximum number of errors before stopping" + ) + max_error_rate: float | None = Field( + default=None, description="Maximum error rate (0-1) before stopping" + ) + max_global_error_rate: float | None = Field( + default=None, description="Maximum global error rate (0-1) before stopping" + ) + + @field_validator("data", "data_args", "rate", mode="wrap") + @classmethod + def single_to_list( + cls, value: Any, handler: ValidatorFunctionWrapHandler + ) -> list[Any]: + """ + Ensures field is always a list. + + :param value: Input value for the 'data' field + :return: List of data sources + """ + try: + return handler(value) + except ValidationError as err: + # If validation fails, try wrapping the value in a list + if err.errors()[0]["type"] == "list_type": + return handler([value]) + else: + raise + + @model_serializer + def serialize_model(self) -> dict[str, Any]: + """ + Convert model to serializable dictionary format. 
+ + Transforms complex types (Backend, Profile, Path, etc.) to JSON-compatible + primitives while preserving configuration semantics for storage and + reproduction. + + :return: Dictionary representation for JSON/YAML serialization + """ + return { + # target - serialize as is + "target": self.target, + "data": [ + item if isinstance(item, str | type(None)) else str(item) + for item in self.data + ], # data - for each item in the list, if not a str or None, save str(item) + "profile": ( + self.profile.type_ + if isinstance(self.profile, Profile) + else self.profile + ), # profile - if instance of Profile, then save as profile.type_ + "rate": self.rate, + "backend": ( + self.backend.type_ + if isinstance(self.backend, Backend) + else self.backend + ), # backend - if instance of Backend, then save as backend.type_ + "backend_kwargs": self.backend_kwargs, + "model": self.model, + "processor": ( + self.processor + if isinstance(self.processor, str) + else str(self.processor) + if self.processor is not None + else None + ), # processor - if not str, then save as str(processor) + "processor_args": self.processor_args, + "data_args": self.data_args, + "data_samples": self.data_samples, + "data_column_mapper": ( + self.data_column_mapper + if isinstance(self.data_column_mapper, dict | str) + else {} + ), # data_column_mapper - if not dict or str, then save as an empty dict + "data_request_formatter": ( + self.data_request_formatter + if isinstance(self.data_request_formatter, dict | str) + else {} + ), # data_request_formatter - if not dict or str, then save as empty dict + "data_collator": ( + self.data_collator if isinstance(self.data_collator, str) else None + ), # data_collator - if not str, then save as None + "data_sampler": ( + self.data_sampler if isinstance(self.data_sampler, str) else None + ), # data_sampler - if not str, then save as None + "data_num_workers": self.data_num_workers, + "dataloader_kwargs": self.dataloader_kwargs, + "random_seed": self.random_seed, + "output_path": ( + str(self.output_path) if self.output_path is not None else None + ), # output_path - if not None, then ensure it's a str + "output_formats": self.output_formats, + "sample_requests": self.sample_requests, + "warmup": self.warmup, + "cooldown": self.cooldown, + "prefer_response_metrics": self.prefer_response_metrics, + "max_seconds": self.max_seconds, + "max_requests": self.max_requests, + "max_errors": self.max_errors, + "max_error_rate": self.max_error_rate, + "max_global_error_rate": self.max_global_error_rate, + } diff --git a/src/guidellm/benchmark/schemas/generative/metrics.py b/src/guidellm/benchmark/schemas/generative/metrics.py new file mode 100644 index 00000000..82f44f37 --- /dev/null +++ b/src/guidellm/benchmark/schemas/generative/metrics.py @@ -0,0 +1,931 @@ +""" +Metrics schemas for generative AI benchmark results and performance analysis. + +This module defines comprehensive metric structures for tracking and analyzing +generative AI benchmark performance across multiple dimensions including request +statistics, token metrics, and domain-specific measurements for text, image, video, +and audio generation. It provides statistical summaries with distribution analysis +across successful, incomplete, and errored requests, along with scheduler-level +performance metrics for request processing and queueing behavior. 
+""" + +from __future__ import annotations + +from typing import Literal + +from pydantic import Field + +from guidellm.benchmark.schemas.generative.accumulator import ( + GenerativeBenchmarkAccumulator, +) +from guidellm.scheduler import SchedulerState +from guidellm.schemas import ( + GenerativeRequestStats, + StandardBaseDict, + StatusBreakdown, + StatusDistributionSummary, +) + +__all__ = [ + "GenerativeAudioMetricsSummary", + "GenerativeImageMetricsSummary", + "GenerativeMetrics", + "GenerativeMetricsSummary", + "GenerativeTextMetricsSummary", + "GenerativeVideoMetricsSummary", + "SchedulerMetrics", + "StatusTypes", + "TimedMetricTypeAlias", +] + + +TimedMetricTypeAlias = ( + tuple[float, float, int | float | None, int | float | None] | None +) +"""Timed metric tuple containing start_time, end_time, input_value, and output_value.""" + +StatusTypes = Literal["successful", "incomplete", "errored"] +"""Request status category for metric compilation.""" + +# Constants for tuple indexing +_TIMED_METRIC_START_TIME_INDEX = 0 +_TIMED_METRIC_END_TIME_INDEX = 1 +_TIMED_METRIC_INPUT_VALUE_INDEX = 2 +_TIMED_METRIC_OUTPUT_VALUE_INDEX = 3 + + +class SchedulerMetrics(StandardBaseDict): + """ + Scheduler timing and performance statistics. + + Tracks overall benchmark timing, request counts by status, and detailed internal + scheduler performance metrics including queue times, processing delays, and + request execution statistics. Used to analyze scheduler efficiency and identify + bottlenecks in request processing pipelines. + """ + + # Overall timings for the scheduler + start_time: float = Field( + description="Unix timestamp when the benchmark run started" + ) + request_start_time: float = Field( + description="Unix timestamp when first request was made" + ) + measure_start_time: float = Field( + description="Unix timestamp when measurement period started" + ) + measure_end_time: float = Field( + description="Unix timestamp when measurement period ended" + ) + request_end_time: float = Field( + description="Unix timestamp when last request completed" + ) + end_time: float = Field(description="Unix timestamp when the benchmark run ended") + + # Request details tracked by the scheduler + requests_made: StatusBreakdown[int, int, int, int] = Field( + description="Request counts by status: successful, incomplete, errored, total" + ) + + # Scheduler internal performance timings + queued_time_avg: float = Field( + description="Avg time requests spent in the queue (seconds)" + ) + resolve_start_delay_avg: float = Field( + description="Avg delay before worker begins resolving req after dequeue (sec)" + ) + resolve_targeted_start_delay_avg: float = Field( + description="Avg delay to targeted resolve start time (seconds)" + ) + request_start_delay_avg: float = Field( + description="Avg delay before request starts after resolve (seconds)" + ) + request_targeted_start_delay_avg: float = Field( + description="Avg delay to targeted request start time (seconds)" + ) + request_time_avg: float = Field(description="Avg request execution time (seconds)") + resolve_end_delay_avg: float = Field( + description="Avg delay after request completes before resolve ends (seconds)" + ) + resolve_time_avg: float = Field( + description="Avg total resolve time including request (seconds)" + ) + finalized_delay_avg: float = Field( + description="Avg delay from resolve end to request finalization (seconds)" + ) + processed_delay_avg: float = Field( + description="Avg delay from finalization to processing completion (seconds)" + ) + 
+ @classmethod + def compile( + cls, + accumulator: GenerativeBenchmarkAccumulator, + scheduler_state: SchedulerState, + ) -> SchedulerMetrics: + """ + Compile scheduler metrics from accumulator and scheduler state. + + :param accumulator: Benchmark accumulator containing timing and metric data + :param scheduler_state: Scheduler state with execution timing information + :return: Compiled scheduler metrics with performance statistics + """ + return SchedulerMetrics( + # Overall timings for the scheduler + start_time=scheduler_state.start_time, + request_start_time=accumulator.timings.request_start or -1.0, + measure_start_time=accumulator.timings.measure_start or -1.0, + measure_end_time=( + accumulator.timings.measure_end + or accumulator.timings.request_end + or -1.0 + ), # if no cooldown, measure_end isn't set, use request_end + request_end_time=accumulator.timings.request_end or -1.0, + end_time=scheduler_state.end_time or -1.0, + # Request details tracked by the scheduler + requests_made=accumulator.scheduler_metrics.requests_made, + # Scheduler internal performance timings + queued_time_avg=accumulator.scheduler_metrics.queued_time.mean or -1.0, + resolve_start_delay_avg=( + accumulator.scheduler_metrics.resolve_start_delay.mean or -1.0 + ), + resolve_targeted_start_delay_avg=( + accumulator.scheduler_metrics.resolve_targeted_start_delay.mean or -1.0 + ), + request_start_delay_avg=( + accumulator.scheduler_metrics.request_start_delay.mean or -1.0 + ), + request_targeted_start_delay_avg=( + accumulator.scheduler_metrics.request_targeted_start_delay.mean or -1.0 + ), + request_time_avg=accumulator.scheduler_metrics.request_time.mean or -1.0, + resolve_end_delay_avg=( + accumulator.scheduler_metrics.resolve_end_delay.mean or -1.0 + ), + resolve_time_avg=accumulator.scheduler_metrics.resolve_time.mean or -1.0, + finalized_delay_avg=( + accumulator.scheduler_metrics.finalized_delay.mean or -1.0 + ), + processed_delay_avg=( + accumulator.scheduler_metrics.processed_delay.mean or -1.0 + ), + ) + + +class GenerativeMetricsSummary(StandardBaseDict): + """ + Statistical summaries for input, output, and total metrics. + + Provides distribution summaries across successful, incomplete, and errored + requests for absolute values, per-second rates, and concurrency levels. 
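+
+    Example (a minimal sketch; assumes ``successful``, ``incomplete``, and
+    ``errored`` are lists of GenerativeRequestStats from a finished benchmark):
+    ::
+        summary = GenerativeMetricsSummary.compile(
+            property_name="text_tokens",
+            successful=successful,
+            incomplete=incomplete,
+            errored=errored,
+        )
+        if summary is not None and summary.output_per_second is not None:
+            print(summary.output_per_second.successful.mean)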
+ """ + + input: StatusDistributionSummary | None = Field( + description="Distribution of input metric values" + ) + input_per_second: StatusDistributionSummary | None = Field( + description="Distribution of input metric rates per second" + ) + input_concurrency: StatusDistributionSummary | None = Field( + description="Distribution of concurrent input metric values" + ) + + output: StatusDistributionSummary | None = Field( + description="Distribution of output metric values" + ) + output_per_second: StatusDistributionSummary | None = Field( + description="Distribution of output metric rates per second" + ) + output_concurrency: StatusDistributionSummary | None = Field( + description="Distribution of concurrent output metric values" + ) + + total: StatusDistributionSummary | None = Field( + description="Distribution of total metric values (input + output)" + ) + total_per_second: StatusDistributionSummary | None = Field( + description="Distribution of total metric rates per second" + ) + total_concurrency: StatusDistributionSummary | None = Field( + description="Distribution of concurrent total metric values" + ) + + @classmethod + def compile( + cls, + property_name: str, + successful: list[GenerativeRequestStats], + incomplete: list[GenerativeRequestStats], + errored: list[GenerativeRequestStats], + ) -> GenerativeMetricsSummary | None: + """ + Compile metrics summary from request statistics for a specific property. + + :param property_name: Name of the property to extract from request metrics + :param successful: Successfully completed request statistics + :param incomplete: Incomplete or cancelled request statistics + :param errored: Failed request statistics + :return: Compiled metrics summary or None if no data available + """ + successful_metrics = cls.extract_property_metrics_for_summary( + successful, property_name + ) + incomplete_metrics = cls.extract_property_metrics_for_summary( + incomplete, property_name + ) + errored_metrics = cls.extract_property_metrics_for_summary( + errored, property_name + ) + + return cls.compile_timed_metrics( + successful=successful_metrics, + incomplete=incomplete_metrics, + errored=errored_metrics, + ) + + @classmethod + def compile_timed_metrics( + cls, + successful: list[TimedMetricTypeAlias], + incomplete: list[TimedMetricTypeAlias], + errored: list[TimedMetricTypeAlias], + ) -> GenerativeMetricsSummary | None: + """ + Compile metrics summary from timed metric tuples. 
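+
+        Each entry is expected to follow the TimedMetricTypeAlias layout of
+        (start_time, end_time, input_value, output_value); for example,
+        ``(1700000000.0, 1700000002.5, 128, 256)`` would describe a request that
+        consumed 128 input units and produced 256 output units (illustrative
+        values only).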
+ + :param successful: Timed metrics from successful requests + :param incomplete: Timed metrics from incomplete requests + :param errored: Timed metrics from errored requests + :return: Compiled metrics summary or None if no data available + """ + + def _compile_metric_distributions( + metrics_by_status: dict[StatusTypes, list[TimedMetricTypeAlias]], + value_index: int, + ) -> tuple[ + StatusDistributionSummary | None, + StatusDistributionSummary | None, + StatusDistributionSummary | None, + dict[StatusTypes, list[float]], + dict[StatusTypes, list[tuple[float, float]]], + dict[StatusTypes, list[tuple[float, float, float]]], + ]: + """Helper to compile value, rate, and concurrency distributions.""" + value_lists: dict[StatusTypes, list[float]] = { + status: [ + float(metric[value_index] or 0.0) + for metric in metrics + if metric is not None + ] + for status, metrics in metrics_by_status.items() + } + value_dist = StatusDistributionSummary.from_values( + successful=value_lists["successful"], + incomplete=value_lists["incomplete"], + errored=value_lists["errored"], + ) + + if value_dist.total_sum == 0.0: + return None, None, None, value_lists, {}, {} + + rate_lists: dict[StatusTypes, list[tuple[float, float]]] = { + status: [ + ( # type: ignore[misc] + metric[_TIMED_METRIC_END_TIME_INDEX], + float(metric[value_index] or 0.0), + ) + for metric in metrics + if metric is not None + ] + for status, metrics in metrics_by_status.items() + } + rate_dist = StatusDistributionSummary.rate_distribution_from_timings( + successful=rate_lists["successful"], + incomplete=rate_lists["incomplete"], + errored=rate_lists["errored"], + ) + + concurrency_lists: dict[StatusTypes, list[tuple[float, float, float]]] = { + status: [ + ( # type: ignore[misc] + metric[_TIMED_METRIC_START_TIME_INDEX], + metric[_TIMED_METRIC_END_TIME_INDEX], + float(metric[value_index] or 0.0), + ) + for metric in metrics + if metric is not None + ] + for status, metrics in metrics_by_status.items() + } + concurrency_dist = ( + StatusDistributionSummary.concurrency_distribution_from_timings( + successful=concurrency_lists["successful"], + incomplete=concurrency_lists["incomplete"], + errored=concurrency_lists["errored"], + ) + ) + + return ( + value_dist, + rate_dist, + concurrency_dist, + value_lists, + rate_lists, + concurrency_lists, + ) + + metrics_by_status: dict[StatusTypes, list[TimedMetricTypeAlias]] = { + "successful": successful, + "incomplete": incomplete, + "errored": errored, + } + + # Calculate input distributions + ( + input_value_dist, + input_rate_dist, + input_concurrency_dist, + input_value_lists, + input_rate_lists, + input_concurrency_lists, + ) = _compile_metric_distributions( + metrics_by_status, _TIMED_METRIC_INPUT_VALUE_INDEX + ) + + # Calculate output distributions + ( + output_value_dist, + output_rate_dist, + output_concurrency_dist, + output_value_lists, + output_rate_lists, + output_concurrency_lists, + ) = _compile_metric_distributions( + metrics_by_status, _TIMED_METRIC_OUTPUT_VALUE_INDEX + ) + + # Calculate total distributions if both input and output have data + if input_value_dist is not None and output_value_dist is not None: + total_value_dist = StatusDistributionSummary.from_values( + successful=( + input_value_lists["successful"] + output_value_lists["successful"] + ), + incomplete=( + input_value_lists["incomplete"] + output_value_lists["incomplete"] + ), + errored=input_value_lists["errored"] + output_value_lists["errored"], + ) + total_rate_dist = 
StatusDistributionSummary.rate_distribution_from_timings( + successful=( + input_rate_lists["successful"] + output_rate_lists["successful"] + ), + incomplete=( + input_rate_lists["incomplete"] + output_rate_lists["incomplete"] + ), + errored=input_rate_lists["errored"] + output_rate_lists["errored"], + ) + total_concurrency_dist = ( + StatusDistributionSummary.concurrency_distribution_from_timings( + successful=( + input_concurrency_lists["successful"] + + output_concurrency_lists["successful"] + ), + incomplete=( + input_concurrency_lists["incomplete"] + + output_concurrency_lists["incomplete"] + ), + errored=( + input_concurrency_lists["errored"] + + output_concurrency_lists["errored"] + ), + ) + ) + else: + total_value_dist = None + total_rate_dist = None + total_concurrency_dist = None + + return GenerativeMetricsSummary( + input=input_value_dist, + input_per_second=input_rate_dist, + input_concurrency=input_concurrency_dist, + output=output_value_dist, + output_per_second=output_rate_dist, + output_concurrency=output_concurrency_dist, + total=total_value_dist, + total_per_second=total_rate_dist, + total_concurrency=total_concurrency_dist, + ) + + @classmethod + def extract_property_metrics_for_summary( + cls, stats_list: list[GenerativeRequestStats], property_name: str + ) -> list[TimedMetricTypeAlias]: + """ + Extract timed metrics for a specific property from request statistics. + + :param stats_list: List of request statistics to extract from + :param property_name: Name of the property to extract from metrics + :return: List of tuples containing + (start_time, end_time, input_value, output_value) + """ + return [ + ( + stats.request_start_time, + stats.request_end_time, + getattr(stats.input_metrics, property_name), + getattr(stats.output_metrics, property_name), + ) + for stats in stats_list + if ( + stats.request_start_time + and stats.request_end_time + and ( + getattr(stats.input_metrics, property_name) is not None + or getattr(stats.output_metrics, property_name) is not None + ) + ) + ] + + +class GenerativeTextMetricsSummary(StandardBaseDict): + """ + Text-specific metric summaries for generative benchmarks. + + Tracks token, word, and character-level metrics across input, output, and + total usage for text generation workloads. + """ + + tokens: GenerativeMetricsSummary | None = Field( + description="Token count metrics and distributions" + ) + words: GenerativeMetricsSummary | None = Field( + description="Word count metrics and distributions" + ) + characters: GenerativeMetricsSummary | None = Field( + description="Character count metrics and distributions" + ) + + @classmethod + def compile( + cls, + successful: list[GenerativeRequestStats], + incomplete: list[GenerativeRequestStats], + errored: list[GenerativeRequestStats], + ) -> GenerativeTextMetricsSummary: + """ + Compile text metrics summary from request statistics. 
+ + :param successful: Successfully completed request statistics + :param incomplete: Incomplete/cancelled request statistics + :param errored: Failed request statistics + :return: Compiled text metrics summary + """ + return GenerativeTextMetricsSummary( + tokens=GenerativeMetricsSummary.compile( + property_name="text_tokens", + successful=successful, + incomplete=incomplete, + errored=errored, + ), + words=GenerativeMetricsSummary.compile( + property_name="text_words", + successful=successful, + incomplete=incomplete, + errored=errored, + ), + characters=GenerativeMetricsSummary.compile( + property_name="text_characters", + successful=successful, + incomplete=incomplete, + errored=errored, + ), + ) + + +class GenerativeImageMetricsSummary(StandardBaseDict): + """ + Image-specific metric summaries for generative benchmarks. + + Tracks token, image count, pixel, and byte-level metrics across input, output, + and total usage for image generation workloads. + """ + + tokens: GenerativeMetricsSummary | None = Field( + description="Image token count metrics and distributions" + ) + images: GenerativeMetricsSummary | None = Field( + description="Image count metrics and distributions" + ) + pixels: GenerativeMetricsSummary | None = Field( + description="Pixel count metrics and distributions" + ) + bytes: GenerativeMetricsSummary | None = Field( + description="Byte size metrics and distributions" + ) + + @classmethod + def compile( + cls, + successful: list[GenerativeRequestStats], + incomplete: list[GenerativeRequestStats], + errored: list[GenerativeRequestStats], + ) -> GenerativeImageMetricsSummary: + """ + Compile image metrics summary from request statistics. + + :param successful: Successfully completed request statistics + :param incomplete: Incomplete/cancelled request statistics + :param errored: Failed request statistics + :return: Compiled image metrics summary + """ + return GenerativeImageMetricsSummary( + tokens=GenerativeMetricsSummary.compile( + property_name="image_tokens", + successful=successful, + incomplete=incomplete, + errored=errored, + ), + images=GenerativeMetricsSummary.compile( + property_name="image_count", + successful=successful, + incomplete=incomplete, + errored=errored, + ), + pixels=GenerativeMetricsSummary.compile( + property_name="image_pixels", + successful=successful, + incomplete=incomplete, + errored=errored, + ), + bytes=GenerativeMetricsSummary.compile( + property_name="image_bytes", + successful=successful, + incomplete=incomplete, + errored=errored, + ), + ) + + +class GenerativeVideoMetricsSummary(StandardBaseDict): + """ + Video-specific metric summaries for generative benchmarks. + + Tracks token, frame count, duration, and byte-level metrics across input, + output, and total usage for video generation workloads. + """ + + tokens: GenerativeMetricsSummary | None = Field( + description="Video token count metrics and distributions" + ) + frames: GenerativeMetricsSummary | None = Field( + description="Frame count metrics and distributions" + ) + seconds: GenerativeMetricsSummary | None = Field( + description="Duration metrics in seconds and distributions" + ) + bytes: GenerativeMetricsSummary | None = Field( + description="Byte size metrics and distributions" + ) + + @classmethod + def compile( + cls, + successful: list[GenerativeRequestStats], + incomplete: list[GenerativeRequestStats], + errored: list[GenerativeRequestStats], + ) -> GenerativeVideoMetricsSummary: + """ + Compile video metrics summary from request statistics. 
+ + :param successful: Successfully completed request statistics + :param incomplete: Incomplete/cancelled request statistics + :param errored: Failed request statistics + :return: Compiled video metrics summary + """ + return GenerativeVideoMetricsSummary( + tokens=GenerativeMetricsSummary.compile( + property_name="video_tokens", + successful=successful, + incomplete=incomplete, + errored=errored, + ), + frames=GenerativeMetricsSummary.compile( + property_name="video_frames", + successful=successful, + incomplete=incomplete, + errored=errored, + ), + seconds=GenerativeMetricsSummary.compile( + property_name="video_seconds", + successful=successful, + incomplete=incomplete, + errored=errored, + ), + bytes=GenerativeMetricsSummary.compile( + property_name="video_bytes", + successful=successful, + incomplete=incomplete, + errored=errored, + ), + ) + + +class GenerativeAudioMetricsSummary(StandardBaseDict): + """ + Audio-specific metric summaries for generative benchmarks. + + Tracks token, sample count, duration, and byte-level metrics across input, + output, and total usage for audio generation workloads. + """ + + tokens: GenerativeMetricsSummary | None = Field( + description="Audio token count metrics and distributions" + ) + samples: GenerativeMetricsSummary | None = Field( + description="Sample count metrics and distributions" + ) + seconds: GenerativeMetricsSummary | None = Field( + description="Duration metrics in seconds and distributions" + ) + bytes: GenerativeMetricsSummary | None = Field( + description="Byte size metrics and distributions" + ) + + @classmethod + def compile( + cls, + successful: list[GenerativeRequestStats], + incomplete: list[GenerativeRequestStats], + errored: list[GenerativeRequestStats], + ) -> GenerativeAudioMetricsSummary: + """ + Compile audio metrics summary from request statistics. + + :param successful: Successfully completed request statistics + :param incomplete: Incomplete/cancelled request statistics + :param errored: Failed request statistics + :return: Compiled audio metrics summary + """ + return GenerativeAudioMetricsSummary( + tokens=GenerativeMetricsSummary.compile( + property_name="audio_tokens", + successful=successful, + incomplete=incomplete, + errored=errored, + ), + samples=GenerativeMetricsSummary.compile( + property_name="audio_samples", + successful=successful, + incomplete=incomplete, + errored=errored, + ), + seconds=GenerativeMetricsSummary.compile( + property_name="audio_seconds", + successful=successful, + incomplete=incomplete, + errored=errored, + ), + bytes=GenerativeMetricsSummary.compile( + property_name="audio_bytes", + successful=successful, + incomplete=incomplete, + errored=errored, + ), + ) + + +class GenerativeMetrics(StandardBaseDict): + """ + Comprehensive metrics for generative AI benchmarks. + + Aggregates request statistics, token metrics, timing distributions, and + domain-specific measurements across text, image, video, and audio modalities. + Provides detailed statistical summaries including distribution analysis for + throughput, latency, concurrency, and resource utilization metrics across + successful, incomplete, and errored requests. 
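+
+    Example (a minimal sketch; assumes ``accumulator`` is a populated
+    GenerativeBenchmarkAccumulator whose measurement window has completed):
+    ::
+        metrics = GenerativeMetrics.compile(accumulator)
+        print(metrics.request_totals.successful)
+        print(metrics.time_to_first_token_ms.successful.mean)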
+ """ + + # Request stats + request_totals: StatusBreakdown[int, int, int, int] = Field( + description="Request counts by status: successful, incomplete, errored, total" + ) + requests_per_second: StatusDistributionSummary = Field( + description="Distribution of requests per second across benchmark execution" + ) + request_concurrency: StatusDistributionSummary = Field( + description="Distribution of concurrent request counts during execution" + ) + request_latency: StatusDistributionSummary = Field( + description="Distribution of request latencies for completed requests" + ) + request_streaming_iterations_count: StatusDistributionSummary = Field( + description="Distribution of stream iterations for completed requests" + ) + + # General token stats + prompt_token_count: StatusDistributionSummary = Field( + description="Distribution of prompt token counts by request status" + ) + output_token_count: StatusDistributionSummary = Field( + description="Distribution of output token counts by request status" + ) + total_token_count: StatusDistributionSummary = Field( + description="Distribution of total token counts by request status" + ) + time_to_first_token_ms: StatusDistributionSummary = Field( + description="Distribution of first token latencies in milliseconds" + ) + time_per_output_token_ms: StatusDistributionSummary = Field( + description="Distribution of average time per output token in milliseconds" + ) + inter_token_latency_ms: StatusDistributionSummary = Field( + description="Distribution of inter-token latencies in milliseconds" + ) + prompt_tokens_per_second: StatusDistributionSummary = Field( + description="Distribution of prompt token processing rates" + ) + output_tokens_per_second: StatusDistributionSummary = Field( + description="Distribution of output token generation rates" + ) + tokens_per_second: StatusDistributionSummary = Field( + description="Distribution of total token throughput including prompt and output" + ) + output_tokens_per_iteration: StatusDistributionSummary = Field( + description="Distribution of output tokens generated per streaming iteration" + ) + iter_tokens_per_iteration: StatusDistributionSummary = Field( + description=( + "Distribution of output tokens (without first) generated per " + "streaming iteration" + ) + ) + + # Domain specific stats + text: GenerativeTextMetricsSummary = Field( + description="Text-specific metrics for tokens, words, and characters" + ) + image: GenerativeImageMetricsSummary = Field( + description="Image-specific metrics for tokens, images, pixels, and bytes" + ) + video: GenerativeVideoMetricsSummary = Field( + description="Video-specific metrics for tokens, frames, duration, and bytes" + ) + audio: GenerativeAudioMetricsSummary = Field( + description="Audio-specific metrics for tokens, samples, duration, and bytes" + ) + + @classmethod + def compile(cls, accumulator: GenerativeBenchmarkAccumulator) -> GenerativeMetrics: + """ + Compile comprehensive generative metrics from benchmark accumulator. 
+ + :param accumulator: Benchmark accumulator with completed request statistics + :return: Compiled generative metrics with all distributions and summaries + :raises ValueError: If measure_start and measure_end/request_end are not set + """ + if (start_time := accumulator.timings.measure_start) is None or ( + end_time := accumulator.timings.measure_end + or accumulator.timings.request_end + ) is None: + raise ValueError( + "Cannot compile GenerativeMetrics: " + "measure_start and measure_end/request_end must be set" + ) + + successful = accumulator.completed.get_within_range(start_time, end_time) + incomplete = accumulator.incomplete.get_within_range(start_time, end_time) + errored = accumulator.errored.get_within_range(start_time, end_time) + + return GenerativeMetrics( + # Request stats + request_totals=StatusBreakdown( + successful=len(successful), + incomplete=len(incomplete), + errored=len(errored), + total=(len(successful) + len(incomplete) + len(errored)), + ), + requests_per_second=StatusDistributionSummary.rate_distribution_from_timings_function( + function=lambda req: req.request_end_time, + successful=successful, + incomplete=incomplete, + errored=errored, + start_time=start_time, + end_time=end_time, + ), + request_concurrency=StatusDistributionSummary.concurrency_distribution_from_timings_function( + function=( + lambda req: (req.request_start_time, req.request_end_time) + if req.request_start_time is not None + and req.request_end_time is not None + else None + ), + successful=successful, + incomplete=incomplete, + errored=errored, + start_time=start_time, + end_time=end_time, + ), + request_latency=StatusDistributionSummary.from_values_function( + function=lambda req: req.request_latency or 0.0, + successful=successful, + incomplete=incomplete, + errored=errored, + ), + request_streaming_iterations_count=StatusDistributionSummary.from_values_function( + function=lambda req: req.info.timings.request_iterations or 0.0, + successful=successful, + incomplete=incomplete, + errored=errored, + ), + # General token stats + prompt_token_count=StatusDistributionSummary.from_values_function( + function=lambda req: req.prompt_tokens or 0.0, + successful=successful, + incomplete=incomplete, + errored=errored, + ), + output_token_count=StatusDistributionSummary.from_values_function( + function=lambda req: req.output_tokens or 0.0, + successful=successful, + incomplete=incomplete, + errored=errored, + ), + total_token_count=StatusDistributionSummary.from_values_function( + function=lambda req: req.total_tokens or 0.0, + successful=successful, + incomplete=incomplete, + errored=errored, + ), + time_to_first_token_ms=StatusDistributionSummary.from_values_function( + function=lambda req: req.time_to_first_token_ms or 0.0, + successful=successful, + incomplete=incomplete, + errored=errored, + ), + time_per_output_token_ms=StatusDistributionSummary.from_values_function( + function=lambda req: ( + req.time_per_output_token_ms or 0.0, + req.output_tokens or 0.0, + ), + successful=successful, + incomplete=incomplete, + errored=errored, + ), + inter_token_latency_ms=StatusDistributionSummary.from_values_function( + function=lambda req: ( + req.inter_token_latency_ms or 0.0, + (req.output_tokens or 1.0) - 1.0, + ), + successful=successful, + incomplete=incomplete, + errored=errored, + ), + prompt_tokens_per_second=StatusDistributionSummary.rate_distribution_from_timings_function( + function=lambda req: req.prompt_tokens_timing, + successful=successful, + incomplete=incomplete, + errored=errored, + 
), + output_tokens_per_second=StatusDistributionSummary.rate_distribution_from_timings_function( + function=lambda req: req.output_tokens_timings, + successful=successful, + incomplete=incomplete, + errored=errored, + ), + tokens_per_second=StatusDistributionSummary.rate_distribution_from_timings_function( + function=lambda req: req.total_tokens_timings, + successful=successful, + incomplete=incomplete, + errored=errored, + ), + output_tokens_per_iteration=StatusDistributionSummary.from_values_function( + function=lambda req: [ + tokens for (_timing, tokens) in req.output_tokens_timings + ], + successful=successful, + incomplete=incomplete, + errored=errored, + ), + iter_tokens_per_iteration=StatusDistributionSummary.from_values_function( + function=lambda req: [ + tokens for (_timing, tokens) in req.iter_tokens_timings + ], + successful=successful, + incomplete=incomplete, + errored=errored, + ), + # Domain-specific stats + text=GenerativeTextMetricsSummary.compile( + successful=successful, incomplete=incomplete, errored=errored + ), + image=GenerativeImageMetricsSummary.compile( + successful=successful, incomplete=incomplete, errored=errored + ), + video=GenerativeVideoMetricsSummary.compile( + successful=successful, incomplete=incomplete, errored=errored + ), + audio=GenerativeAudioMetricsSummary.compile( + successful=successful, incomplete=incomplete, errored=errored + ), + ) diff --git a/src/guidellm/benchmark/schemas/generative/report.py b/src/guidellm/benchmark/schemas/generative/report.py new file mode 100644 index 00000000..16cc654b --- /dev/null +++ b/src/guidellm/benchmark/schemas/generative/report.py @@ -0,0 +1,125 @@ +""" +Report container for multiple generative benchmark results with persistence. + +Provides data structures for aggregating multiple benchmark executions into a single +report with file I/O capabilities. Supports loading and saving benchmark collections +in JSON and YAML formats, enabling result persistence, sharing, and analysis across +different execution sessions. Core functionality includes benchmark grouping with +shared configuration parameters and flexible file path resolution. +""" + +from __future__ import annotations + +import json +from pathlib import Path +from typing import ClassVar, Literal + +import yaml +from pydantic import Field + +from guidellm.benchmark.schemas.generative.benchmark import GenerativeBenchmark +from guidellm.benchmark.schemas.generative.entrypoints import ( + BenchmarkGenerativeTextArgs, +) +from guidellm.schemas import StandardBaseModel + +__all__ = ["GenerativeBenchmarksReport"] + + +class GenerativeBenchmarksReport(StandardBaseModel): + """ + Container for multiple benchmark results with load/save functionality. + + Aggregates multiple generative benchmark executions into a single report, + providing persistence through JSON and YAML file formats. Enables result + collection, storage, and retrieval across different execution sessions with + automatic file type detection and path resolution. 
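+
+    Example (a minimal sketch; ``args`` stands in for a BenchmarkGenerativeTextArgs
+    instance, ``benchmark`` for a completed GenerativeBenchmark, and the file path
+    is a placeholder):
+    ::
+        report = GenerativeBenchmarksReport(args=args, benchmarks=[benchmark])
+        saved_path = report.save_file("results/benchmarks.json")
+        loaded = GenerativeBenchmarksReport.load_file(saved_path)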
+ + :cvar DEFAULT_FILE: Default filename used when saving to or loading from a directory + """ + + DEFAULT_FILE: ClassVar[str] = "benchmarks.json" + + args: BenchmarkGenerativeTextArgs = Field( + description="Benchmark arguments used for all benchmarks in the report" + ) + benchmarks: list[GenerativeBenchmark] = Field( + description="List of completed benchmarks in the report", + default_factory=list, + ) + + def save_file( + self, + path: str | Path | None = None, + type_: Literal["json", "yaml"] | None = None, + ) -> Path: + """ + Save report to file in JSON or YAML format. + + :param path: File path or directory for saving, defaults to current directory + with DEFAULT_FILE name + :param type_: File format override ('json' or 'yaml'), auto-detected from + extension if None + :return: Resolved path to the saved file + :raises ValueError: If file type is unsupported or cannot be determined + """ + file_path = GenerativeBenchmarksReport._resolve_path( + path if path is not None else Path.cwd() + ) + file_path.parent.mkdir(parents=True, exist_ok=True) + file_type = type_ or file_path.suffix.lower()[1:] + model_dict = self.model_dump() + + if file_type == "json": + save_str = json.dumps(model_dict) + elif file_type in ["yaml", "yml"]: + save_str = yaml.dump(model_dict) + else: + raise ValueError(f"Unsupported file type: {file_type} for {file_path}.") + + with file_path.open("w") as file: + file.write(save_str) + + return file_path + + @classmethod + def load_file( + cls, path: str | Path, type_: Literal["json", "yaml"] | None = None + ) -> GenerativeBenchmarksReport: + """ + Load report from JSON or YAML file. + + :param path: File path or directory containing DEFAULT_FILE to load from + :param type_: File format override ('json' or 'yaml'), auto-detected from + extension if None + :return: Loaded report instance with benchmarks and configuration + :raises ValueError: If file type is unsupported or cannot be determined + :raises FileNotFoundError: If specified file does not exist + """ + file_path = GenerativeBenchmarksReport._resolve_path(path) + file_type = type_ or file_path.suffix.lower()[1:] + + with file_path.open("r") as file: + if file_type == "json": + model_dict = json.loads(file.read()) + elif file_type in ["yaml", "yml"]: + model_dict = yaml.safe_load(file) + else: + raise ValueError(f"Unsupported file type: {file_type} for {file_path}.") + + return GenerativeBenchmarksReport.model_validate(model_dict) + + @classmethod + def _resolve_path(cls, path: str | Path) -> Path: + """ + Resolve input to file path, converting directories to DEFAULT_FILE location. 
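+
+        For example, passing a directory such as ``Path("results")`` (assumed to
+        already exist) resolves to ``results/benchmarks.json`` via DEFAULT_FILE,
+        while an explicit file path is returned unchanged.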
+ + :param path: String or Path to resolve, directories append DEFAULT_FILE + :return: Resolved file path + """ + resolved = Path(path) if not isinstance(path, Path) else path + + if resolved.is_dir(): + resolved = resolved / GenerativeBenchmarksReport.DEFAULT_FILE + + return resolved diff --git a/src/guidellm/data/__init__.py b/src/guidellm/data/__init__.py index 0bff1b64..9adbd3c8 100644 --- a/src/guidellm/data/__init__.py +++ b/src/guidellm/data/__init__.py @@ -9,6 +9,7 @@ DataDependentPreprocessor, DatasetPreprocessor, PreprocessorRegistry, + RequestFormatter, ) from .processor import ProcessorFactory from .schemas import GenerativeDatasetColumnType @@ -25,4 +26,5 @@ "GenerativeRequestCollator", "PreprocessorRegistry", "ProcessorFactory", + "RequestFormatter", ] diff --git a/src/guidellm/data/deserializers/synthetic.py b/src/guidellm/data/deserializers/synthetic.py index e1df911a..6e098462 100644 --- a/src/guidellm/data/deserializers/synthetic.py +++ b/src/guidellm/data/deserializers/synthetic.py @@ -17,7 +17,8 @@ DatasetDeserializer, DatasetDeserializerFactory, ) -from guidellm.utils import IntegerRangeSampler, StandardBaseModel +from guidellm.schemas import StandardBaseModel +from guidellm.utils import IntegerRangeSampler __all__ = [ "SyntheticTextDatasetConfig", diff --git a/src/guidellm/data/loaders.py b/src/guidellm/data/loaders.py index b4ee38da..4f96002e 100644 --- a/src/guidellm/data/loaders.py +++ b/src/guidellm/data/loaders.py @@ -2,7 +2,7 @@ import contextlib from collections.abc import Callable, Iterator -from typing import Any, Literal +from typing import Any, Literal, TypeVar import torch from torch.utils.data import Sampler @@ -17,7 +17,10 @@ __all__ = ["DataLoader", "DatasetsIterator"] -class DatasetsIterator(TorchIterableDataset): +DataT = TypeVar("DataT") + + +class DatasetsIterator(TorchIterableDataset[DataT]): def __init__( self, data: list[Any], @@ -60,7 +63,7 @@ def __init__( list(self.generator(data_samples)) if data_samples else None ) - def __iter__(self): + def __iter__(self) -> Iterator[DataT]: worker_info = torch.utils.data.get_worker_info() worker_modulus = worker_info.num_workers if worker_info is not None else 1 worker_index = worker_info.id if worker_info is not None else 0 @@ -77,7 +80,7 @@ def generator( max_items: int | None = None, modulus: int | None = None, offset: int | None = None, - ) -> Iterator[Any]: + ) -> Iterator[DataT]: gen_count = 0 with contextlib.suppress(StopIteration): @@ -102,7 +105,7 @@ def generator( # passed into the preprocessor, which is a type violation. # This should be fixed at some point. 
row = preprocessor(row) # type: ignore[assignment] - yield row + yield row # type: ignore[misc] except Exception as err: # noqa: BLE001 # Exception logged logger.error(f"Skipping data row due to error: {err}") gen_count -= 1 @@ -114,7 +117,7 @@ def generator( ) -class DataLoader(PyTorchDataLoader): +class DataLoader(PyTorchDataLoader[DataT]): def __init__( self, data: list[Any], @@ -128,7 +131,7 @@ def __init__( random_seed: int = 42, **kwargs: Any, ): - iterator = DatasetsIterator( + iterator: DatasetsIterator[DataT] = DatasetsIterator( data=data, data_args=data_args, data_samples=data_samples, diff --git a/src/guidellm/data/preprocessors/__init__.py b/src/guidellm/data/preprocessors/__init__.py index 664e196b..6d6e722d 100644 --- a/src/guidellm/data/preprocessors/__init__.py +++ b/src/guidellm/data/preprocessors/__init__.py @@ -3,6 +3,7 @@ GenerativeAudioTranslationRequestFormatter, GenerativeChatCompletionsRequestFormatter, GenerativeTextCompletionsRequestFormatter, + RequestFormatter, ) from .mappers import GenerativeColumnMapper from .preprocessor import ( @@ -22,4 +23,5 @@ "GenerativeColumnMapper", "GenerativeTextCompletionsRequestFormatter", "PreprocessorRegistry", + "RequestFormatter", ] diff --git a/src/guidellm/data/preprocessors/formatters.py b/src/guidellm/data/preprocessors/formatters.py index 5a869403..608128a6 100644 --- a/src/guidellm/data/preprocessors/formatters.py +++ b/src/guidellm/data/preprocessors/formatters.py @@ -1,6 +1,5 @@ from __future__ import annotations -from abc import ABCMeta from typing import Any from guidellm.data.preprocessors.preprocessor import ( @@ -14,10 +13,14 @@ "GenerativeAudioTranslationRequestFormatter", "GenerativeChatCompletionsRequestFormatter", "GenerativeTextCompletionsRequestFormatter", + "RequestFormatter", ] -class RequestFormatter(DatasetPreprocessor, metaclass=ABCMeta): +class RequestFormatter(DatasetPreprocessor): + def __init__(self, model: str, **_kwargs): + self.model = model + @staticmethod def encode_audio(*args, **kwargs): from guidellm.extras.audio import encode_audio @@ -47,7 +50,7 @@ def __init__( max_tokens: int | None = None, max_completion_tokens: int | None = None, ): - self.model: str | None = model + self.model: str = model self.extras = ( GenerationRequestArguments(**extras) if extras and isinstance(extras, dict) @@ -73,6 +76,7 @@ def __call__(self, columns: dict[str, list[Any]]) -> GenerationRequest: if self.stream: arguments.stream = True arguments.body["stream"] = True + arguments.body["stream_options"] = {"include_usage": True} # Handle output tokens if output_tokens := sum( @@ -158,9 +162,8 @@ def __call__( # noqa: C901, PLR0912, PLR0915 # Configure streaming if self.stream: arguments.stream = True - arguments.body.update( - {"stream": True, "stream_options": {"include_usage": True}} - ) + arguments.body["stream"] = True + arguments.body["stream_options"] = {"include_usage": True} # Handle output tokens if output_tokens := sum( @@ -334,6 +337,7 @@ def __call__( # noqa: C901 if self.stream: arguments.stream = True arguments.body["stream"] = True + arguments.body["stream_options"] = {"include_usage": True} # Handle output tokens if output_tokens := sum( diff --git a/src/guidellm/data/preprocessors/preprocessor.py b/src/guidellm/data/preprocessors/preprocessor.py index e95ad75d..43fe20e9 100644 --- a/src/guidellm/data/preprocessors/preprocessor.py +++ b/src/guidellm/data/preprocessors/preprocessor.py @@ -25,6 +25,6 @@ def setup_data( class PreprocessorRegistry( - RegistryMixin[DataDependentPreprocessor | 
type[DataDependentPreprocessor]] + RegistryMixin[type[DatasetPreprocessor] | type[DataDependentPreprocessor]] ): pass diff --git a/src/guidellm/data/processor.py b/src/guidellm/data/processor.py index 7962bfbf..e55eb123 100644 --- a/src/guidellm/data/processor.py +++ b/src/guidellm/data/processor.py @@ -1,11 +1,9 @@ from __future__ import annotations +from pathlib import Path from typing import Any -from transformers import ( # type: ignore[import] - AutoTokenizer, - PreTrainedTokenizerBase, -) +from transformers import AutoTokenizer, PreTrainedTokenizerBase # type: ignore[import] __all__ = ["ProcessorFactory"] @@ -13,7 +11,7 @@ class ProcessorFactory: def __init__( self, - processor: str | PreTrainedTokenizerBase, + processor: str | Path | PreTrainedTokenizerBase, processor_args: dict[str, Any] | None = None, ) -> None: self.processor = processor diff --git a/src/guidellm/mock_server/server.py b/src/guidellm/mock_server/server.py index ff9d5fcd..e85c6134 100644 --- a/src/guidellm/mock_server/server.py +++ b/src/guidellm/mock_server/server.py @@ -11,12 +11,13 @@ from __future__ import annotations import time +from typing import Any from sanic import Sanic, response from sanic.exceptions import NotFound from sanic.log import logger from sanic.request import Request -from sanic.response import HTTPResponse +from sanic.response import BaseHTTPResponse, HTTPResponse from guidellm.mock_server.config import MockServerConfig from guidellm.mock_server.handlers import ( @@ -65,16 +66,20 @@ def _setup_middleware(self): """Setup middleware for CORS, logging, etc.""" @self.app.middleware("request") - async def add_cors_headers(_request: Request): + async def add_cors_headers(_request: Request) -> None: """Add CORS headers to all requests.""" + return None # noqa: RET501 @self.app.middleware("response") - async def add_response_headers(_request: Request, resp: HTTPResponse): + async def add_response_headers( + _request: Any, resp: BaseHTTPResponse + ) -> HTTPResponse: """Add standard response headers.""" resp.headers["Access-Control-Allow-Origin"] = "*" resp.headers["Access-Control-Allow-Methods"] = "GET, POST, OPTIONS" resp.headers["Access-Control-Allow-Headers"] = "Content-Type, Authorization" resp.headers["Server"] = "guidellm-mock-server" + return resp # type: ignore[return-value] def _setup_routes(self): # noqa: C901 @self.app.get("/health") diff --git a/src/guidellm/preprocess/dataset.py b/src/guidellm/preprocess/dataset.py index 49ce7b09..033bf106 100644 --- a/src/guidellm/preprocess/dataset.py +++ b/src/guidellm/preprocess/dataset.py @@ -276,8 +276,8 @@ def process_dataset( processor_args, "dataset conversion.", ) - prompt_column = column_mappings.get("prompt_column") - output_column = column_mappings.get( + prompt_column = column_mappings.get("prompt_column") # type: ignore[attr-defined] + output_column = column_mappings.get( # type: ignore[attr-defined] "output_tokens_count_column", "output_tokens_count" ) @@ -304,7 +304,7 @@ def process_dataset( ) ) - dataset_iterator = iter(dataset) + dataset_iterator = iter(dataset) # type: ignore[call-overload] processed_prompts = [] prompt_handler = STRATEGY_HANDLERS[short_prompt_strategy] diff --git a/src/guidellm/presentation/__init__.py b/src/guidellm/presentation/__init__.py deleted file mode 100644 index 872188db..00000000 --- a/src/guidellm/presentation/__init__.py +++ /dev/null @@ -1,28 +0,0 @@ -from .builder import UIDataBuilder -from .data_models import ( - BenchmarkDatum, - Bucket, - Dataset, - Distribution, - Model, - RunInfo, - Server, - 
TokenDetails, - WorkloadDetails, -) -from .injector import create_report, inject_data - -__all__ = [ - "BenchmarkDatum", - "Bucket", - "Dataset", - "Distribution", - "Model", - "RunInfo", - "Server", - "TokenDetails", - "UIDataBuilder", - "WorkloadDetails", - "create_report", - "inject_data", -] diff --git a/src/guidellm/presentation/builder.py b/src/guidellm/presentation/builder.py deleted file mode 100644 index 6ea9c5c3..00000000 --- a/src/guidellm/presentation/builder.py +++ /dev/null @@ -1,27 +0,0 @@ -from typing import TYPE_CHECKING, Any - -if TYPE_CHECKING: - from guidellm.benchmark import GenerativeBenchmark - -from guidellm.presentation.data_models import BenchmarkDatum, RunInfo, WorkloadDetails - - -class UIDataBuilder: - def __init__(self, benchmarks: list["GenerativeBenchmark"]): - self.benchmarks = benchmarks - - def build_run_info(self): - return RunInfo.from_benchmarks(self.benchmarks) - - def build_workload_details(self): - return WorkloadDetails.from_benchmarks(self.benchmarks) - - def build_benchmarks(self): - return [BenchmarkDatum.from_benchmark(b) for b in self.benchmarks] - - def to_dict(self) -> dict[str, Any]: - return { - "run_info": self.build_run_info().model_dump(), - "workload_details": self.build_workload_details().model_dump(), - "benchmarks": [b.model_dump() for b in self.build_benchmarks()], - } diff --git a/src/guidellm/presentation/data_models.py b/src/guidellm/presentation/data_models.py deleted file mode 100644 index deec925c..00000000 --- a/src/guidellm/presentation/data_models.py +++ /dev/null @@ -1,236 +0,0 @@ -import random -from collections import defaultdict -from math import ceil -from typing import TYPE_CHECKING - -from pydantic import BaseModel, computed_field - -if TYPE_CHECKING: - from guidellm.benchmark import GenerativeBenchmark - -from guidellm.utils import DistributionSummary - - -class Bucket(BaseModel): - value: float | int - count: int - - @staticmethod - def from_data( - data: list[float] | list[int], - bucket_width: float | None = None, - n_buckets: int | None = None, - ) -> tuple[list["Bucket"], float]: - if not data: - return [], 1.0 - - min_v = min(data) - max_v = max(data) - range_v = (1 + max_v) - min_v - - if bucket_width is None: - if n_buckets is None: - n_buckets = 10 - bucket_width = range_v / n_buckets - else: - n_buckets = ceil(range_v / bucket_width) - - bucket_counts: defaultdict[float | int, int] = defaultdict(int) - for val in data: - idx = int((val - min_v) // bucket_width) - if idx >= n_buckets: - idx = n_buckets - 1 - bucket_start = min_v + idx * bucket_width - bucket_counts[bucket_start] += 1 - - buckets = [ - Bucket(value=start, count=count) - for start, count in sorted(bucket_counts.items()) - ] - return buckets, bucket_width - - -class Model(BaseModel): - name: str - size: int - - -class Dataset(BaseModel): - name: str - - -class RunInfo(BaseModel): - model: Model - task: str - timestamp: float - dataset: Dataset - - @classmethod - def from_benchmarks(cls, benchmarks: list["GenerativeBenchmark"]): - model = benchmarks[0].benchmarker.backend.get("model", "N/A") - timestamp = max( - bm.run_stats.start_time for bm in benchmarks if bm.start_time is not None - ) - return cls( - model=Model(name=model or "", size=0), - task="N/A", - timestamp=timestamp, - dataset=Dataset(name="N/A"), - ) - - -class Distribution(BaseModel): - statistics: DistributionSummary | None = None - buckets: list[Bucket] - bucket_width: float - - -class TokenDetails(BaseModel): - samples: list[str] - token_distributions: Distribution - - -class 
Server(BaseModel): - target: str - - -class RequestOverTime(BaseModel): - num_benchmarks: int - requests_over_time: Distribution - - -class WorkloadDetails(BaseModel): - prompts: TokenDetails - generations: TokenDetails - requests_over_time: RequestOverTime - rate_type: str - server: Server - - @classmethod - def from_benchmarks(cls, benchmarks: list["GenerativeBenchmark"]): - target = benchmarks[0].benchmarker.backend.get("target", "N/A") - rate_type = benchmarks[0].scheduler.strategy.type_ - successful_requests = [ - req for bm in benchmarks for req in bm.requests.successful - ] - sample_indices = random.sample( - range(len(successful_requests)), min(5, len(successful_requests)) - ) - sample_prompts = [ - req.request_args.replace("\n", " ").replace('"', "'") - if (req := successful_requests[i]).request_args - else "" - for i in sample_indices - ] - sample_outputs = [ - req.output.replace("\n", " ").replace('"', "'") - if (req := successful_requests[i]).output - else "" - for i in sample_indices - ] - - prompt_tokens = [ - float(req.prompt_tokens) if req.prompt_tokens is not None else -1 - for bm in benchmarks - for req in bm.requests.successful - ] - output_tokens = [ - float(req.output_tokens) if req.output_tokens is not None else -1 - for bm in benchmarks - for req in bm.requests.successful - ] - - prompt_token_buckets, _prompt_token_bucket_width = Bucket.from_data( - prompt_tokens, 1 - ) - output_token_buckets, _output_token_bucket_width = Bucket.from_data( - output_tokens, 1 - ) - - prompt_token_stats = DistributionSummary.from_values(prompt_tokens) - output_token_stats = DistributionSummary.from_values(output_tokens) - prompt_token_distributions = Distribution( - statistics=prompt_token_stats, buckets=prompt_token_buckets, bucket_width=1 - ) - output_token_distributions = Distribution( - statistics=output_token_stats, buckets=output_token_buckets, bucket_width=1 - ) - - min_start_time = benchmarks[0].start_time - - all_req_times = [ - req.info.timings.request_start - min_start_time - for bm in benchmarks - for req in bm.requests.successful - if req.info.timings.request_start is not None - ] - number_of_buckets = len(benchmarks) - request_over_time_buckets, bucket_width = Bucket.from_data( - all_req_times, None, number_of_buckets - ) - request_over_time_distribution = Distribution( - buckets=request_over_time_buckets, bucket_width=bucket_width - ) - return cls( - prompts=TokenDetails( - samples=sample_prompts, token_distributions=prompt_token_distributions - ), - generations=TokenDetails( - samples=sample_outputs, token_distributions=output_token_distributions - ), - requests_over_time=RequestOverTime( - requests_over_time=request_over_time_distribution, - num_benchmarks=number_of_buckets, - ), - rate_type=rate_type, - server=Server(target=target), - ) - - -class TabularDistributionSummary(DistributionSummary): - """ - Same fields as `DistributionSummary`, but adds a ready-to-serialize/iterate - `percentile_rows` helper. 
- """ - - @computed_field - def percentile_rows(self) -> list[dict[str, str | float]]: - rows = [ - {"percentile": name, "value": value} - for name, value in self.percentiles.model_dump().items() - ] - return list( - filter(lambda row: row["percentile"] in ["p50", "p90", "p95", "p99"], rows) - ) - - @classmethod - def from_distribution_summary( - cls, distribution: DistributionSummary - ) -> "TabularDistributionSummary": - return cls(**distribution.model_dump()) - - -class BenchmarkDatum(BaseModel): - requests_per_second: float - itl: TabularDistributionSummary - ttft: TabularDistributionSummary - throughput: TabularDistributionSummary - time_per_request: TabularDistributionSummary - - @classmethod - def from_benchmark(cls, bm: "GenerativeBenchmark"): - return cls( - requests_per_second=bm.metrics.requests_per_second.successful.mean, - itl=TabularDistributionSummary.from_distribution_summary( - bm.metrics.inter_token_latency_ms.successful - ), - ttft=TabularDistributionSummary.from_distribution_summary( - bm.metrics.time_to_first_token_ms.successful - ), - throughput=TabularDistributionSummary.from_distribution_summary( - bm.metrics.output_tokens_per_second.successful - ), - time_per_request=TabularDistributionSummary.from_distribution_summary( - bm.metrics.request_latency.successful - ), - ) diff --git a/src/guidellm/presentation/injector.py b/src/guidellm/presentation/injector.py deleted file mode 100644 index 1e78080e..00000000 --- a/src/guidellm/presentation/injector.py +++ /dev/null @@ -1,65 +0,0 @@ -import re -from pathlib import Path - -from loguru import logger - -from guidellm.settings import settings -from guidellm.utils.text import load_text - - -def create_report(js_data: dict, output_path: str | Path) -> Path: - """ - Creates a report from the dictionary and saves it to the output path. - - :param js_data: dict with match str and json data to inject - :type js_data: dict - :param output_path: the file to save the report to. - :type output_path: str - :return: the path to the saved report - :rtype: str - """ - - if not isinstance(output_path, Path): - output_path = Path(output_path) - - html_content = load_text(settings.report_generation.source) - report_content = inject_data( - js_data, - html_content, - ) - - output_path.parent.mkdir(parents=True, exist_ok=True) - output_path.write_text(report_content) - return output_path - - -def inject_data( - js_data: dict, - html: str, -) -> str: - """ - Injects the json data into the HTML, - replacing placeholders only within the section. 
-
-    :param js_data: the json data to inject
-    :type js_data: dict
-    :param html: the html to inject the data into
-    :type html: str
-    :return: the html with the json data injected
-    :rtype: str
-    """
-    head_match = re.search(r"<head[^>]*>(.*?)</head>", html, re.DOTALL | re.IGNORECASE)
-    if not head_match:
-        logger.warning("<head> section missing, returning original HTML.")
-
-        return html
-
-    head_content = head_match.group(1)
-
-    # Replace placeholders only inside the <head> content
-    for placeholder, script in js_data.items():
-        head_content = head_content.replace(placeholder, script)
-
-    # Rebuild the HTML
-    new_head = f"<head>{head_content}</head>"
-    return html[: head_match.start()] + new_head + html[head_match.end() :]
diff --git a/src/guidellm/scheduler/constraints.py b/src/guidellm/scheduler/constraints.py
index e24419ea..bbf34fb4 100644
--- a/src/guidellm/scheduler/constraints.py
+++ b/src/guidellm/scheduler/constraints.py
@@ -21,9 +21,9 @@
     SchedulerUpdateAction,
     SchedulerUpdateActionProgress,
 )
-from guidellm.schemas import RequestInfo
+from guidellm.schemas import RequestInfo, StandardBaseModel
 from guidellm.settings import settings
-from guidellm.utils import InfoMixin, RegistryMixin, StandardBaseModel
+from guidellm.utils import InfoMixin, RegistryMixin
 
 __all__ = [
     "Constraint",
diff --git a/src/guidellm/scheduler/scheduler.py b/src/guidellm/scheduler/scheduler.py
index 6da76438..1b5e28f6 100644
--- a/src/guidellm/scheduler/scheduler.py
+++ b/src/guidellm/scheduler/scheduler.py
@@ -24,7 +24,7 @@
 from guidellm.scheduler.strategies import SchedulingStrategy
 from guidellm.scheduler.worker_group import WorkerProcessGroup
 from guidellm.schemas import RequestInfo
-from guidellm.utils.singleton import ThreadSafeSingletonMixin
+from guidellm.utils import ThreadSafeSingletonMixin
 
 __all__ = ["Scheduler"]
 
diff --git a/src/guidellm/scheduler/schemas.py b/src/guidellm/scheduler/schemas.py
index 21567c67..b202b010 100644
--- a/src/guidellm/scheduler/schemas.py
+++ b/src/guidellm/scheduler/schemas.py
@@ -16,8 +16,8 @@
 from pydantic import Field
 from typing_extensions import TypeAliasType, TypedDict
 
-from guidellm.schemas import RequestInfo
-from guidellm.utils import RegistryMixin, StandardBaseModel
+from guidellm.schemas import RequestInfo, StandardBaseModel
+from guidellm.utils import RegistryMixin
 from guidellm.utils.registry import RegistryObjT
 
 __all__ = [
diff --git a/src/guidellm/scheduler/strategies.py b/src/guidellm/scheduler/strategies.py
index e1473b93..9fce40ab 100644
--- a/src/guidellm/scheduler/strategies.py
+++ b/src/guidellm/scheduler/strategies.py
@@ -20,8 +20,8 @@
 
 from pydantic import Field, PrivateAttr
 
-from guidellm.schemas import RequestInfo
-from guidellm.utils import InfoMixin, PydanticClassRegistryMixin
+from guidellm.schemas import PydanticClassRegistryMixin, RequestInfo
+from guidellm.utils import InfoMixin
 
 __all__ = [
     "AsyncConstantStrategy",
diff --git a/src/guidellm/scheduler/worker.py b/src/guidellm/scheduler/worker.py
index 977635fa..6f37b1da 100644
--- a/src/guidellm/scheduler/worker.py
+++ b/src/guidellm/scheduler/worker.py
@@ -408,9 +408,8 @@ async def _dequeue_next_request(
     async def _schedule_request(
         self, request: RequestT, request_info: RequestInfo, target_start: float
     ):
-        current_time = time.time()
-        request_info.timings.scheduled_at = current_time
-        if target_start > current_time:
+        request_info.timings.scheduled_at = request_info.timings.dequeued
+        if target_start > (current_time := time.time()):
             await asyncio.sleep(target_start - current_time)
             # Adapt delay so that scheduled at reflects
the sleep time request_info.timings.scheduled_at = target_start diff --git a/src/guidellm/scheduler/worker_group.py b/src/guidellm/scheduler/worker_group.py index 2a0a51de..d30403a6 100644 --- a/src/guidellm/scheduler/worker_group.py +++ b/src/guidellm/scheduler/worker_group.py @@ -228,11 +228,11 @@ async def create_processes(self): worker = WorkerProcess[RequestT, ResponseT]( worker_index=rank, - messaging=self.messaging.create_worker_copy( + messaging=self.messaging.create_worker_copy( # type: ignore[arg-type] worker_index=rank, max_buffer_send_size=None, max_buffer_receive_size=per_proc_max_buffer_size, - ), # The non-group worker lacks the SchedulerState type. Type err. + ), backend=self.backend, strategy=self.strategy, async_limit=async_limit, @@ -632,6 +632,8 @@ def _locked_update( ) def _update_state_request_counts(self, info: RequestInfo): + finalized = time.time() + if info.status == "queued": self._queued_request_ids.add(info.request_id) self._state.queued_requests = len(self._queued_request_ids) @@ -647,11 +649,13 @@ def _update_state_request_counts(self, info: RequestInfo): self._processing_request_ids.add(info.request_id) self._state.processing_requests = len(self._processing_request_ids) elif info.status == "completed": + info.timings.finalized = finalized self._processing_request_ids.remove(info.request_id) self._state.processing_requests = len(self._processing_request_ids) self._state.processed_requests += 1 self._state.successful_requests += 1 elif info.status in ("errored", "cancelled"): + info.timings.finalized = finalized if info.request_id in self._queued_request_ids: self._queued_request_ids.remove(info.request_id) self._state.queued_requests = len(self._queued_request_ids) diff --git a/src/guidellm/schemas/__init__.py b/src/guidellm/schemas/__init__.py index 42268f72..d230c204 100644 --- a/src/guidellm/schemas/__init__.py +++ b/src/guidellm/schemas/__init__.py @@ -9,6 +9,13 @@ from __future__ import annotations +from .base import ( + PydanticClassRegistryMixin, + ReloadableBaseModel, + StandardBaseDict, + StandardBaseModel, + StatusBreakdown, +) from .info import RequestInfo, RequestTimings from .request import ( GenerationRequest, @@ -16,16 +23,31 @@ GenerativeRequestType, UsageMetrics, ) +from .request_stats import GenerativeRequestStats from .response import GenerationResponse -from .stats import GenerativeRequestStats +from .statistics import ( + DistributionSummary, + FunctionObjT, + Percentiles, + StatusDistributionSummary, +) __all__ = [ + "DistributionSummary", + "FunctionObjT", "GenerationRequest", "GenerationRequestArguments", "GenerationResponse", "GenerativeRequestStats", "GenerativeRequestType", + "Percentiles", + "PydanticClassRegistryMixin", + "ReloadableBaseModel", "RequestInfo", "RequestTimings", + "StandardBaseDict", + "StandardBaseModel", + "StatusBreakdown", + "StatusDistributionSummary", "UsageMetrics", ] diff --git a/src/guidellm/utils/pydantic_utils.py b/src/guidellm/schemas/base.py similarity index 100% rename from src/guidellm/utils/pydantic_utils.py rename to src/guidellm/schemas/base.py diff --git a/src/guidellm/schemas/info.py b/src/guidellm/schemas/info.py index 4b5d188c..854756d4 100644 --- a/src/guidellm/schemas/info.py +++ b/src/guidellm/schemas/info.py @@ -14,7 +14,7 @@ from pydantic import Field, computed_field -from guidellm.utils import StandardBaseDict, StandardBaseModel +from guidellm.schemas.base import StandardBaseDict, StandardBaseModel __all__ = ["RequestInfo", "RequestTimings"] @@ -53,17 +53,23 @@ class 
RequestTimings(StandardBaseDict): default=None, description="Unix timestamp when the backend began processing the request", ) - first_iteration: float | None = Field( + first_request_iteration: float | None = Field( default=None, - description="Unix timestamp when the first iteration for a streaming began", ) - last_iteration: float | None = Field( + first_token_iteration: float | None = Field( default=None, - description="Unix timestamp when the last iteration for a streaming completed", ) - iterations: int | None = Field( + last_token_iteration: float | None = Field( default=None, - description="Total number of streaming update iterations performed", + ) + last_request_iteration: float | None = Field( + default=None, + ) + request_iterations: int = Field( + default=0, + ) + token_iterations: int = Field( + default=0, ) request_end: float | None = Field( default=None, @@ -78,6 +84,25 @@ class RequestTimings(StandardBaseDict): description="Unix timestamp when request was processed by the scheduler", ) + @property + def last_reported(self) -> float | None: + """ + Get the most recent timing measurement available. + + :return: The latest Unix timestamp from the timing fields, or None if none + """ + timing_fields = [ + self.queued, + self.dequeued, + self.scheduled_at, + self.resolve_start, + self.request_start, + self.request_end, + self.resolve_end, + ] + valid_timings = [field for field in timing_fields if field is not None] + return max(valid_timings) if valid_timings else None + class RequestInfo(StandardBaseModel): """ diff --git a/src/guidellm/schemas/request.py b/src/guidellm/schemas/request.py index 1f90d130..ed9a31f4 100644 --- a/src/guidellm/schemas/request.py +++ b/src/guidellm/schemas/request.py @@ -14,7 +14,7 @@ from pydantic import Field, computed_field -from guidellm.utils import StandardBaseDict, StandardBaseModel +from guidellm.schemas.base import StandardBaseDict, StandardBaseModel __all__ = [ "GenerationRequest", diff --git a/src/guidellm/schemas/request_stats.py b/src/guidellm/schemas/request_stats.py new file mode 100644 index 00000000..10db80be --- /dev/null +++ b/src/guidellm/schemas/request_stats.py @@ -0,0 +1,333 @@ +""" +Request statistics and metrics for generative AI benchmark analysis. + +Provides data structures for capturing and analyzing performance metrics from +generative AI workloads. The module contains request-level statistics including +token counts, latency measurements, and throughput calculations essential for +evaluating text generation benchmark performance. Computed properties enable +analysis of time-to-first-token, inter-token latency, and token generation rates. +""" + +from __future__ import annotations + +from typing import Literal + +import numpy as np +from pydantic import Field, computed_field + +from guidellm.schemas.base import StandardBaseDict +from guidellm.schemas.info import RequestInfo +from guidellm.schemas.request import GenerativeRequestType, UsageMetrics + +__all__ = ["GenerativeRequestStats"] + + +class GenerativeRequestStats(StandardBaseDict): + """ + Request statistics for generative AI text generation workloads. + + Captures comprehensive performance metrics for individual generative requests, + including token counts, timing measurements, and derived performance statistics. + Provides computed properties for latency analysis, throughput calculations, + and token generation metrics essential for benchmark evaluation. 
+ + Example: + :: + stats = GenerativeRequestStats( + request_id="req_123", + request_type="text_completion", + info=request_info, + input_metrics=input_usage, + output_metrics=output_usage + ) + throughput = stats.output_tokens_per_second + """ + + type_: Literal["generative_request_stats"] = "generative_request_stats" + request_id: str = Field(description="Unique identifier for the request") + request_type: GenerativeRequestType | str = Field( + description="Type of generative request (text_completion or chat_completion)" + ) + request_args: str | None = Field( + default=None, description="Backend arguments used for this request" + ) + output: str | None = Field( + default=None, description="Generated text output from the request" + ) + info: RequestInfo = Field(description="Request metadata and timing information") + input_metrics: UsageMetrics = Field( + description="Token usage statistics for the input prompt" + ) + output_metrics: UsageMetrics = Field( + description="Token usage statistics for the generated output" + ) + + # Request stats + @computed_field # type: ignore[misc] + @property + def request_start_time(self) -> float | None: + """ + :return: Timestamp when the request started, or None if unavailable + """ + return self.info.timings.request_start or self.info.timings.resolve_start + + @computed_field # type: ignore[misc] + @property + def request_end_time(self) -> float: + """ + :return: Timestamp when the request ended, or None if unavailable + """ + if self.info.timings.resolve_end is None: + raise ValueError("resolve_end timings should be set but is None.") + + return self.info.timings.request_end or self.info.timings.resolve_end + + @computed_field # type: ignore[misc] + @property + def request_latency(self) -> float | None: + """ + End-to-end request processing latency in seconds. 
+ + :return: Duration from request start to completion, or None if unavailable + """ + if not (start := self.info.timings.request_start) or not ( + end := self.info.timings.request_end + ): + return None + + return end - start + + # General token stats + @computed_field # type: ignore[misc] + @property + def prompt_tokens(self) -> int | None: + """ + :return: Number of tokens in the input prompt, or None if unavailable + """ + return self.input_metrics.text_tokens + + @computed_field # type: ignore[misc] + @property + def input_tokens(self) -> int | None: + """ + :return: Number of tokens in the input prompt, or None if unavailable + """ + return self.input_metrics.total_tokens + + @computed_field # type: ignore[misc] + @property + def output_tokens(self) -> int | None: + """ + :return: Number of tokens in the generated output, or None if unavailable + """ + return self.output_metrics.total_tokens + + @computed_field # type: ignore[misc] + @property + def total_tokens(self) -> int | None: + """ + :return: Sum of prompt and output tokens, or None if both unavailable + """ + input_tokens = self.input_metrics.total_tokens + output_tokens = self.output_metrics.total_tokens + + if input_tokens is None and output_tokens is None: + return None + + return (input_tokens or 0) + (output_tokens or 0) + + @computed_field # type: ignore[misc] + @property + def time_to_first_token_ms(self) -> float | None: + """ + :return: Time to first token generation in milliseconds, or None if unavailable + """ + if not (first_token := self.first_token_iteration) or not ( + start := self.info.timings.request_start + ): + return None + + return 1000 * (first_token - start) + + @computed_field # type: ignore[misc] + @property + def time_per_output_token_ms(self) -> float | None: + """ + Average time per output token in milliseconds including first token. + + :return: Average milliseconds per output token, or None if unavailable + """ + if ( + not (start := self.info.timings.request_start) + or not (last_token := self.last_token_iteration) + or not (output_tokens := self.output_tokens) + ): + return None + + return 1000 * (last_token - start) / output_tokens + + @computed_field # type: ignore[misc] + @property + def inter_token_latency_ms(self) -> float | None: + """ + Average inter-token latency in milliseconds excluding first token. 
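        For example, with hypothetical timings where the first token arrives at t=10.0 s, the last token at t=12.0 s, and 21 output tokens are generated (20 inter-token gaps):
        ::
            inter_token_latency_ms = 1000 * (12.0 - 10.0) / (21 - 1)  # 100.0 ms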
+ + :return: Average milliseconds between token generations, or None if unavailable + """ + if ( + not (first_token := self.first_token_iteration) + or not (last_token := self.last_token_iteration) + or not (output_tokens := self.output_tokens) + or output_tokens <= 1 + ): + return None + + return 1000 * (last_token - first_token) / (output_tokens - 1) + + @computed_field # type: ignore[misc] + @property + def tokens_per_second(self) -> float | None: + """ + :return: Total tokens per second throughput, or None if unavailable + """ + if not (latency := self.request_latency) or self.total_tokens is None: + return None + + return self.total_tokens / latency + + @computed_field # type: ignore[misc] + @property + def output_tokens_per_second(self) -> float | None: + """ + :return: Output token generation throughput, or None if unavailable + """ + if not (latency := self.request_latency) or self.output_tokens is None: + return None + + return self.output_tokens / latency + + @computed_field # type: ignore[misc] + @property + def iter_tokens_per_iteration(self) -> float | None: + """ + :return: Average tokens per iteration excluding first token, or None if + unavailable + """ + if ( + self.output_tokens is None + or self.output_tokens <= 1 + or self.token_iterations <= 1 + ): + return None + + return (self.output_tokens - 1.0) / ( + self.token_iterations - 1.0 + ) # subtract 1 for first token from the prompt, assume first iter is 1 token + + @computed_field # type: ignore[misc] + @property + def output_tokens_per_iteration(self) -> float | None: + """ + :return: Average output tokens per iteration, or None if unavailable + """ + if self.output_tokens is None or self.token_iterations < 1: + return None + + return self.output_tokens / self.token_iterations + + @property + def first_token_iteration(self) -> float | None: + """ + :return: Timestamp of first token generation, or None if unavailable + """ + return self.info.timings.first_token_iteration + + @property + def last_token_iteration(self) -> float | None: + """ + :return: Timestamp of last token generation, or None if unavailable + """ + return self.info.timings.last_token_iteration + + @property + def token_iterations(self) -> int: + """ + :return: Total number of token generation iterations + """ + return self.info.timings.token_iterations + + @property + def prompt_tokens_timing(self) -> tuple[float, float] | None: + """ + :return: Tuple of (timestamp, token_count) for prompt processing, or None + if unavailable + """ + if self.request_end_time is None: + # no end time, can't compute + return None + + return ( + self.first_token_iteration or self.request_end_time, + self.prompt_tokens or 0.0, + ) + + @property + def output_tokens_timings(self) -> list[tuple[float, float]]: + """ + :return: List of (timestamp, token_count) tuples for output token generations + """ + if self.request_end_time is None: + # no end time, can't compute + return [] + + if ( + self.first_token_iteration is None + or self.last_token_iteration is None + or self.token_iterations <= 1 + ): + # No iteration data, return single timing at end with all tokens + return [ + ( + self.last_token_iteration or self.request_end_time, + self.output_tokens or 0.0, + ) + ] + + # Return first token timing as 1 token plus per-iteration timings + return [ + (self.first_token_iteration, 1.0 * bool(self.output_tokens)) + ] + self.iter_tokens_timings + + @property + def iter_tokens_timings(self) -> list[tuple[float, float]]: + """ + :return: List of (timestamp, token_count) tuples for 
iterations excluding + first token + """ + if ( + self.first_token_iteration is None + or self.last_token_iteration is None + or (tok_per_iter := self.iter_tokens_per_iteration) is None + or self.token_iterations <= 1 + ): + return [] + + # evenly space the iterations since we don't have per-iteration timings + # / we don't know the individual token counts per iteration + iter_times = np.linspace( + self.first_token_iteration, + self.last_token_iteration, + num=self.token_iterations, + )[1:] # skip first iteration + + return [(iter_time, tok_per_iter) for iter_time in iter_times] + + @property + def total_tokens_timings(self) -> list[tuple[float, float]]: + """ + :return: List of (timestamp, token_count) tuples for all token generations + """ + prompt_timings = self.prompt_tokens_timing + output_timings = self.output_tokens_timings + + return ([prompt_timings] if prompt_timings else []) + output_timings diff --git a/src/guidellm/schemas/response.py b/src/guidellm/schemas/response.py index d4e53aa3..a02ae8ba 100644 --- a/src/guidellm/schemas/response.py +++ b/src/guidellm/schemas/response.py @@ -11,10 +11,10 @@ from pydantic import Field +from guidellm.schemas.base import StandardBaseModel from guidellm.schemas.info import RequestInfo from guidellm.schemas.request import GenerationRequest, UsageMetrics -from guidellm.schemas.stats import GenerativeRequestStats -from guidellm.utils import StandardBaseModel +from guidellm.schemas.request_stats import GenerativeRequestStats __all__ = ["GenerationResponse"] diff --git a/src/guidellm/schemas/statistics.py b/src/guidellm/schemas/statistics.py new file mode 100644 index 00000000..bbfe666d --- /dev/null +++ b/src/guidellm/schemas/statistics.py @@ -0,0 +1,1002 @@ +""" +Statistical distribution analysis and summary calculations for benchmark metrics. + +Provides comprehensive statistical analysis tools including percentile calculations, +summary statistics, and status-based distributions. Supports value distributions, +time-based rate and concurrency distributions with weighted sampling, and probability +density functions for analyzing benchmark performance metrics and request patterns +across different status categories (successful, incomplete, errored). +""" + +from __future__ import annotations + +import math +from collections.abc import Callable, Sequence +from typing import Literal, TypeVar + +import numpy as np +from pydantic import Field + +from guidellm.schemas.base import StandardBaseModel, StatusBreakdown + +__all__ = [ + "DistributionSummary", + "FunctionObjT", + "Percentiles", + "StatusDistributionSummary", +] + +FunctionObjT = TypeVar("FunctionObjT") + + +class Percentiles(StandardBaseModel): + """ + Standard percentile values for probability distributions. + + Captures key percentile points from 0.1th to 99.9th percentile for comprehensive + distribution analysis, enabling assessment of central tendency, spread, and tail + behavior in benchmark metrics. 
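    Example (a minimal sketch; the PDF values below are hypothetical):
    ::
        import numpy as np

        from guidellm.schemas import Percentiles

        # four equally likely values; probabilities must sum to 1
        pdf = np.array([[1.0, 0.25], [2.0, 0.25], [3.0, 0.25], [4.0, 0.25]])
        percentiles = Percentiles.from_pdf(pdf)
        print(percentiles.p50, percentiles.p99)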
+ """ + + p001: float = Field(description="0.1th percentile value") + p01: float = Field(description="1st percentile value") + p05: float = Field(description="5th percentile value") + p10: float = Field(description="10th percentile value") + p25: float = Field(description="25th percentile value") + p50: float = Field(description="50th percentile (median) value") + p75: float = Field(description="75th percentile value") + p90: float = Field(description="90th percentile value") + p95: float = Field(description="95th percentile value") + p99: float = Field(description="99th percentile value") + p999: float = Field(description="99.9th percentile value") + + @classmethod + def from_pdf( + cls, pdf: np.ndarray, epsilon: float = 1e-6, validate: bool = True + ) -> Percentiles: + """ + Create percentiles from a probability density function. + + :param pdf: 2D array (N, 2) with values in column 0 and probabilities in + column 1 + :param epsilon: Tolerance for probability sum validation + :param validate: Whether to validate probabilities sum to 1 and are + non-negative + :return: Percentiles object with computed values + :raises ValueError: If PDF shape is invalid, probabilities are negative, + or probabilities don't sum to 1 + """ + expected_shape = (None, 2) + + if len(pdf.shape) != len(expected_shape) or pdf.shape[1] != expected_shape[1]: + raise ValueError( + "PDF must be a 2D array of shape (N, 2) where first column is values " + f"and second column is probabilities. Got {pdf.shape} instead." + ) + + percentile_probs = { + "p001": 0.001, + "p01": 0.01, + "p05": 0.05, + "p10": 0.1, + "p25": 0.25, + "p50": 0.5, + "p75": 0.75, + "p90": 0.9, + "p95": 0.95, + "p99": 0.99, + "p999": 0.999, + } + + if pdf.shape[0] == 0: + return Percentiles(**dict.fromkeys(percentile_probs.keys(), 0.0)) + + probabilities = pdf[:, 1] + + if validate: + if np.any(probabilities < 0): + raise ValueError("Probabilities must be non-negative.") + + prob_sum = np.sum(probabilities) + if abs(prob_sum - 1.0) > epsilon: + raise ValueError(f"Probabilities must sum to 1, got {prob_sum}.") + + cdf_probs = np.cumsum(probabilities) + + return Percentiles( + **{ + key: pdf[np.searchsorted(cdf_probs, value, side="left"), 0].item() + for key, value in percentile_probs.items() + } + ) + + +class DistributionSummary(StandardBaseModel): + """ + Comprehensive statistical summary of a probability distribution. + + Captures central tendency (mean, median, mode), spread (variance, std_dev), + extrema (min, max), and percentile information with optional probability density + function. Supports creation from raw values, PDFs, or time-based event data for + rate and concurrency analysis in benchmark metrics. 
+ """ + + mean: float = Field(description="Mean/average value") + median: float = Field(description="Median (50th percentile) value") + mode: float = Field(description="Mode (most probable) value") + variance: float = Field(description="Variance of the distribution") + std_dev: float = Field(description="Standard deviation") + min: float = Field(description="Minimum value") + max: float = Field(description="Maximum value") + count: int = Field(description="Number of observations") + total_sum: float = Field(description="Sum of all values") + percentiles: Percentiles = Field(description="Standard percentile values") + pdf: list[tuple[float, float]] | None = Field( + description="Probability density function as (value, probability) pairs", + default=None, + ) + + @classmethod + def from_pdf( + cls, + pdf: np.ndarray, + count: int | None = None, + include_pdf: bool | int = False, + epsilon: float = 1e-6, + validate: bool = True, + ) -> DistributionSummary: + """ + Create distribution summary from a probability density function. + + :param pdf: 2D array (N, 2) with values in column 0 and probabilities in + column 1 + :param count: Number of original observations; defaults to PDF length + :param include_pdf: Whether to include PDF; True for full, int for sampled size + :param epsilon: Tolerance for probability validation + :param validate: Whether to validate probabilities sum to 1 and are non-negative + :return: Complete distribution summary with statistics + :raises ValueError: If PDF shape is invalid or probabilities are invalid + """ + expected_shape = (None, 2) + + if len(pdf.shape) != len(expected_shape) or pdf.shape[1] != expected_shape[1]: + raise ValueError( + "PDF must be a 2D array of shape (N, 2) where first column is values " + f"and second column is probabilities. Got {pdf.shape} instead." 
+ ) + + if pdf.shape[0] == 0: + return DistributionSummary( + mean=0.0, + median=0.0, + mode=0.0, + variance=0.0, + std_dev=0.0, + min=0.0, + max=0.0, + count=0 if count is None else count, + total_sum=0.0, + percentiles=Percentiles.from_pdf(pdf, epsilon=epsilon), + pdf=None if include_pdf is False else [], + ) + + # Calculate stats + values = pdf[:, 0] + probabilities = pdf[:, 1] + + if validate: + # Fail if probabilities don't sum to 1 or are negative + if np.any(probabilities < 0): + raise ValueError("Probabilities must be non-negative.") + + prob_sum = np.sum(probabilities) + if not np.isclose(prob_sum, 1.0, atol=epsilon): + raise ValueError(f"Probabilities must sum to 1.0 (sum={prob_sum}).") + + # Fail if values are not sorted + if not np.all(values[:-1] <= values[1:]): + raise ValueError("Values in PDF must be sorted in ascending order.") + + percentiles = Percentiles.from_pdf(pdf, epsilon=epsilon, validate=False) + median = percentiles.p50 + mean = np.sum(values * probabilities).item() + mode = values[np.argmax(probabilities)].item() + variance = np.sum((values - mean) ** 2 * probabilities).item() + std_dev = math.sqrt(variance) + minimum = values[0].item() + maximum = values[-1].item() + + if count is None: + count = len(pdf) + + total_sum = mean * count + + if include_pdf is False: + sampled_pdf = None + elif include_pdf is True: + sampled_pdf = pdf.tolist() + else: + sampled_pdf = [] + + return DistributionSummary( + mean=mean, + median=median, + mode=mode, + variance=variance, + std_dev=std_dev, + min=minimum, + max=maximum, + count=count, + total_sum=total_sum, + percentiles=percentiles, + pdf=sampled_pdf, + ) + + @classmethod + def from_values( + cls, + values: Sequence[float | tuple[float, float]] | np.ndarray, + count: int | None = None, + include_pdf: bool | int = False, + epsilon: float = 1e-6, + ) -> DistributionSummary: + """ + Create distribution summary from raw values with optional weights. 
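        For example, (value, weight) tuples allow duration-weighted samples, where the weights below are hypothetical observation durations:
        ::
            # value 10.0 observed for 2.0 units of time, value 20.0 for 1.0
            summary = DistributionSummary.from_values([(10.0, 2.0), (20.0, 1.0)])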
+ + :param values: Values or (value, weight) tuples, or numpy array + :param count: Number of original observations; defaults to sum of weights + :param include_pdf: Whether to include PDF; True for full, int for sampled size + :param epsilon: Tolerance for probability validation + :return: Distribution summary computed from the values + :raises ValueError: If total weight is zero or invalid + """ + np_values = cls._to_weighted_ndarray(values, num_values_per_item=2) + + if np_values.shape[0] == 0: + return DistributionSummary.from_pdf( + pdf=np.empty((0, 2)), count=0, include_pdf=include_pdf, epsilon=epsilon + ) + + if count is None: + count = round(np.sum(np_values[:, 1]).item()) + + # Sort values and weights by values + sort_ind = np.argsort(np_values[:, 0]) + sorted_values = np_values[sort_ind, 0] + sorted_weights = np_values[sort_ind, 1] + + # Combine any duplicate values by summing their weights + unique_values, inverse_indices = np.unique(sorted_values, return_inverse=True) + combined_weights = np.zeros_like(unique_values, dtype=float) + np.add.at(combined_weights, inverse_indices, sorted_weights) + + # Remove any values with zero weight + nonzero_mask = combined_weights > 0 + final_values = unique_values[nonzero_mask] + final_weights = combined_weights[nonzero_mask] + + # Create PDF by normalizing weights and stacking + total_weight = np.sum(final_weights) + if total_weight <= epsilon: + # No valid weights to create PDF, overwrite to uniform distribution + final_weights = np.ones_like(final_values) + total_weight = np.sum(final_weights) + + probabilities = final_weights / total_weight + pdf = np.column_stack((final_values, probabilities)) + + return DistributionSummary.from_pdf( + pdf=pdf, + count=count, + include_pdf=include_pdf, + epsilon=epsilon, + validate=False, + ) + + @classmethod + def rate_distribution_from_timings( + cls, + event_times: Sequence[float | tuple[float, float]] | np.ndarray, + start_time: float | None = None, + end_time: float | None = None, + threshold: float | None = 1e-4, # 1/10th of a millisecond + include_pdf: bool | int = False, + epsilon: float = 1e-6, + ) -> DistributionSummary: + """ + Create rate distribution from event timestamps. + + Computes event rates over time intervals weighted by interval duration for + analyzing request throughput patterns. 
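        For example, request completion timestamps (hypothetical Unix times) yield a completions-per-second distribution:
        ::
            completion_times = [100.0, 100.4, 100.9, 101.5, 102.0]
            rate = DistributionSummary.rate_distribution_from_timings(
                completion_times, start_time=100.0, end_time=102.0
            )
            print(rate.mean)  # duration-weighted average events per second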
+ + :param event_times: Event timestamps or (timestamp, weight) tuples + :param start_time: Analysis window start; filters earlier events + :param end_time: Analysis window end; filters later events + :param threshold: Time threshold for merging nearby events; 1/10th millisecond + :param include_pdf: Whether to include PDF; True for full, int for sampled size + :param epsilon: Tolerance for probability validation + :return: Distribution summary of event rates over time + """ + weighted_times = cls._to_weighted_ndarray(event_times, num_values_per_item=2) + + if start_time is not None: + # Filter out any times before start, insert start time with 0 weight + weighted_times = np.insert( + weighted_times[weighted_times[:, 0] >= start_time], + 0, + [start_time, 0.0], + axis=0, + ) + + if end_time is not None: + # Filter out any times after end, insert end time with 0 weight + weighted_times = np.append( + weighted_times[weighted_times[:, 0] <= end_time], + [[end_time, 0.0]], + axis=0, + ) + + # Sort by time for merging, merge any times within threshold + sort_ind = np.argsort(weighted_times[:, 0]) + weighted_times = weighted_times[sort_ind] + weighted_times = cls._merge_sorted_times_with_weights(weighted_times, threshold) + + if len(weighted_times) <= 1: + # No data to calculate rates from (need at least two times) + return cls.from_values( + [], + count=len(weighted_times), + include_pdf=include_pdf, + epsilon=epsilon, + ) + + times = weighted_times[:, 0] + occurrences = weighted_times[:, 1] + + # Calculate local duration for each event: ((times[i+1] - times[i-1])) / 2 + midpoints = (times[1:] + times[:-1]) / 2 + durations = np.empty_like(times) + durations[0] = midpoints[0] - times[0] + durations[1:-1] = midpoints[1:] - midpoints[:-1] + durations[-1] = np.clip(times[-1] - midpoints[-1], epsilon, None) + + # Calculate rate at each interval: occurences[i] / duration[i] + rates = occurrences / durations + count = round(np.sum(occurrences).item()) + + return cls.from_values( + np.column_stack((rates, durations)), + count=count, + include_pdf=include_pdf, + epsilon=epsilon, + ) + + @classmethod + def concurrency_distribution_from_timings( + cls, + event_intervals: ( + Sequence[tuple[float, float] | tuple[float, float, float]] | np.ndarray + ), + start_time: float | None = None, + end_time: float | None = None, + threshold: float | None = 1e-4, # 1/10th of a millisecond + include_pdf: bool | int = False, + epsilon: float = 1e-6, + ) -> DistributionSummary: + """ + Create concurrency distribution from event time intervals. + + Tracks overlapping events to compute concurrency levels over time for analyzing + request processing patterns and resource utilization. 
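        For example, overlapping (start, end) intervals (hypothetical request lifetimes) produce a concurrency-over-time distribution:
        ::
            intervals = [(100.0, 102.0), (101.0, 103.0), (101.5, 102.5)]
            concurrency = DistributionSummary.concurrency_distribution_from_timings(intervals)
            print(concurrency.max)  # peak number of simultaneously active requests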
+ + :param event_intervals: Event (start, end) or (start, end, weight) tuples + :param start_time: Analysis window start + :param end_time: Analysis window end + :param threshold: Time threshold for merging nearby transitions; + 1/10th millisecond + :param include_pdf: Whether to include PDF; True for full, int for sampled size + :param epsilon: Tolerance for probability validation + :return: Distribution summary of concurrency levels over time + """ + weighted_intervals = cls._to_weighted_ndarray( + event_intervals, num_values_per_item=3 + ) + + # If start_time, filter any intervals that end before start_time + if start_time is not None: + keep_mask = weighted_intervals[:, 1] >= start_time + weighted_intervals = weighted_intervals[keep_mask] + + # If end_time, filter any intervals that start after end_time + if end_time is not None: + keep_mask = weighted_intervals[:, 0] <= end_time + weighted_intervals = weighted_intervals[keep_mask] + + count = len(weighted_intervals) + + # Convert to concurrency changes at each time + add_occurences = ( + np.stack( + ( + weighted_intervals[:, 0], + weighted_intervals[:, 2], + ), + axis=1, + ) + if len(weighted_intervals) > 0 + else np.empty((0, 2)) + ) + remove_occurences = ( + np.stack( + ( + weighted_intervals[:, 1], + -1 * weighted_intervals[:, 2], + ), + axis=1, + ) + if len(weighted_intervals) > 0 + else np.empty((0, 2)) + ) + + # Combine add and remove occurences into weighted times + weighted_times = np.vstack((add_occurences, remove_occurences)) + + # Sort by the times and merge any times within threshold + weighted_times = weighted_times[np.argsort(weighted_times[:, 0])] + weighted_times = cls._merge_sorted_times_with_weights(weighted_times, threshold) + + # If start_time, ensure included (if any before, add final concurrency at start) + if start_time is not None and len(weighted_times) > 0: + start_ind = np.searchsorted(weighted_times[:, 0], start_time, side="left") + prior_delta = ( + np.sum(weighted_times[:start_ind, 1]) if start_ind > 0 else 0.0 + ) + weighted_times = np.insert( + weighted_times[start_ind:], 0, [start_time, prior_delta], axis=0 + ) + + # If end_time, ensure included (if any after, filter out) + if end_time is not None and len(weighted_times) > 0: + end_ind = np.searchsorted(weighted_times[:, 0], end_time, side="right") + weighted_times = np.append( + weighted_times[:end_ind], [[end_time, 0.0]], axis=0 + ) + + # Calculate concurrency from cumulative sum of changes over time + concurrencies = np.clip(np.cumsum(weighted_times[:, 1]), 0, None) + + if len(concurrencies) <= 1: + # No data to calculate concurrency from + return cls.from_values( + [] if count == 0 else [concurrencies[0].item()], + include_pdf=include_pdf, + epsilon=epsilon, + ) + + # Calculate durations equal to times[i+1] - times[i] + # The last concurrency level is not used since no following time point + durations = np.clip(np.diff(weighted_times[:, 0]), 0, None) + values = np.column_stack((concurrencies[:-1], durations)) + + return ( + cls.from_values( + values, + count=count, + include_pdf=include_pdf, + epsilon=epsilon, + ) + if np.any(durations > 0) + else cls.from_values( + [], + count=count, + include_pdf=include_pdf, + epsilon=epsilon, + ) + ) + + @classmethod + def _to_weighted_ndarray( + cls, + inputs: ( + Sequence[float | tuple[float, float] | tuple[float, float, float]] + | np.ndarray + ), + num_values_per_item: Literal[2, 3], + ) -> np.ndarray: + if not isinstance(inputs, np.ndarray): + # Convert list to structured numpy array with dims (N, 
num_dimensions) + # Fill in missing weights with 1.0 + return cls._sequence_to_weighted_ndarray(inputs, num_values_per_item) + + if len(inputs.shape) == 1: + # 1D array: reshape to (N, 1) and add weights column + inputs = inputs.reshape(-1, 1) + weights = np.ones((inputs.shape[0], 1), dtype=float) + + return ( + np.hstack((inputs, weights)) + if num_values_per_item == 2 # noqa: PLR2004 + else np.hstack((inputs, inputs, weights)) + ) + + if len(inputs.shape) == 2 and inputs.shape[1] == num_values_per_item - 1: # noqa: PLR2004 + # Add weights column of 1.0 + weights = np.ones((inputs.shape[0], 1), dtype=float) + + return np.hstack((inputs, weights)) + + if len(inputs.shape) == 2 and inputs.shape[1] == num_values_per_item: # noqa: PLR2004 + return inputs + + raise ValueError( + "inputs must be a numpy array of shape (N,), " + f"(N, {num_values_per_item - 1}), or (N, {num_values_per_item}). " + f"Got shape {inputs.shape}." + ) + + @classmethod + def _sequence_to_weighted_ndarray( + cls, + inputs: Sequence[float | tuple[float, float] | tuple[float, float, float]], + num_values_per_item: Literal[2, 3], + ) -> np.ndarray: + ndarray = np.empty((len(inputs), num_values_per_item), dtype=float) + scalar_types: tuple[type, ...] = (int, float, np.integer, np.floating) + + for ind, val in enumerate(inputs): + if isinstance(val, scalar_types): + ndarray[ind, :] = ( + (val, 1.0) if num_values_per_item == 2 else (val, val, 1.0) # noqa: PLR2004 + ) + elif isinstance(val, tuple) and len(val) == num_values_per_item: + ndarray[ind, :] = val + elif isinstance(val, tuple) and len(val) == num_values_per_item - 1: + ndarray[ind, :] = ( + (val[0], 1.0) if num_values_per_item == 2 else (val[0], val[1], 1.0) # noqa: PLR2004 + ) + else: + raise ValueError( + "Each item must be a float or a tuple of " + f"{num_values_per_item} or {num_values_per_item - 1} " + "elements." 
+ ) + + return ndarray + + @classmethod + def _merge_sorted_times_with_weights( + cls, weighted_times: np.ndarray, threshold: float | None + ) -> np.ndarray: + # First remove any exact duplicate times and sum their weights + unique_times, inverse = np.unique(weighted_times[:, 0], return_inverse=True) + unique_weights = np.zeros_like(unique_times, dtype=float) + np.add.at(unique_weights, inverse, weighted_times[:, 1]) + weighted_times = np.column_stack((unique_times, unique_weights)) + + if threshold is None or threshold <= 0.0: + return weighted_times + + # Loop to merge times within threshold until no more merges possible + # (loop due to possible overlapping merge groups) + while weighted_times.shape[0] > 1: + times = weighted_times[:, 0] + weights = weighted_times[:, 1] + + # Find diffs between consecutive times, create mask for within-threshold + diffs = np.diff(times) + within = diffs <= threshold + if not np.any(within): + break + + # Start indices are marked by the transition from 0 to 1 in the mask + # End indices found by searching for last time within threshold from start + starts = np.where(np.diff(np.insert(within.astype(int), 0, 0)) == 1)[0] + start_end_times = times[starts] + threshold + ends = np.searchsorted(times, start_end_times, side="right") - 1 + + # Collapse overlapping or chained merge groups + if len(starts) > 1: + valid_mask = np.concatenate([[True], starts[1:] > ends[:-1]]) + starts, ends = starts[valid_mask], ends[valid_mask] + + # Update weights at start indices to sum of merged weights + cumsum = np.concatenate(([0.0], np.cumsum(weights))) + weighted_times[starts, 1] = cumsum[ends + 1] - cumsum[starts] + + # Calculate vectorized mask for removing merged entries + merged_events = np.zeros(len(weighted_times) + 1, dtype=int) + np.add.at(merged_events, starts, 1) + np.add.at(merged_events, ends + 1, -1) + remove_mask = np.cumsum(merged_events[:-1]) > 0 + remove_mask[starts] = False # Keep start indices + + # Remove merged entries, update weighted_times + weights = weights[~remove_mask] + times = times[~remove_mask] + weighted_times = np.column_stack((times, weights)) + + return weighted_times + + +class StatusDistributionSummary( + StatusBreakdown[ + DistributionSummary, + DistributionSummary, + DistributionSummary, + DistributionSummary, + ] +): + """ + Distribution summaries broken down by request status categories. + + Provides separate statistical analysis for successful, incomplete, and errored + requests with total aggregate statistics. Enables status-aware performance analysis + and SLO validation across different request outcomes in benchmark results. + """ + + @property + def count(self) -> int: + """ + :return: Total count of samples across all status categories + """ + return self.total.count + + @property + def total_sum(self) -> float: + """ + :return: Total sum of values across all status categories + """ + return self.total.total_sum + + @classmethod + def from_values( + cls, + successful: Sequence[float | tuple[float, float]] | np.ndarray, + incomplete: Sequence[float | tuple[float, float]] | np.ndarray, + errored: Sequence[float | tuple[float, float]] | np.ndarray, + include_pdf: bool | int = False, + epsilon: float = 1e-6, + ) -> StatusDistributionSummary: + """ + Create status-broken-down distribution from values by status category. 
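        For example, per-request latencies (hypothetical values in seconds) can be split by outcome:
        ::
            from guidellm.schemas import StatusDistributionSummary

            latency_summary = StatusDistributionSummary.from_values(
                successful=[0.9, 1.1, 1.0],
                incomplete=[2.5],
                errored=[0.3, 0.4],
            )
            print(latency_summary.successful.mean, latency_summary.total.count)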
+ + :param successful: Values or (value, weight) tuples for successful requests + :param incomplete: Values or (value, weight) tuples for incomplete requests + :param errored: Values or (value, weight) tuples for errored requests + :param include_pdf: Whether to include PDF; True for full, int for sampled size + :param epsilon: Tolerance for probability validation + :return: Status breakdown of distribution summaries + """ + total, successful_arr, incomplete_arr, errored_arr = cls._combine_status_arrays( + successful, incomplete, errored, num_values_per_item=2 + ) + + return StatusDistributionSummary( + total=DistributionSummary.from_values( + total, include_pdf=include_pdf, epsilon=epsilon + ), + successful=DistributionSummary.from_values( + successful_arr, include_pdf=include_pdf, epsilon=epsilon + ), + incomplete=DistributionSummary.from_values( + incomplete_arr, include_pdf=include_pdf, epsilon=epsilon + ), + errored=DistributionSummary.from_values( + errored_arr, include_pdf=include_pdf, epsilon=epsilon + ), + ) + + @classmethod + def from_values_function( + cls, + function: Callable[ + [FunctionObjT], + float | tuple[float, float] | Sequence[float | tuple[float, float]] | None, + ], + successful: Sequence[FunctionObjT], + incomplete: Sequence[FunctionObjT], + errored: Sequence[FunctionObjT], + include_pdf: bool | int = False, + epsilon: float = 1e-6, + ) -> StatusDistributionSummary: + """ + Create distribution summary by extracting values from objects via function. + + :param function: Function to extract value(s) from each object + :param successful: Successful request objects + :param incomplete: Incomplete request objects + :param errored: Errored request objects + :param include_pdf: Whether to include PDF; True for full, int for sampled size + :param epsilon: Tolerance for probability validation + :return: Status breakdown of distribution summaries + """ + + def _extract_values( + _objs: Sequence[FunctionObjT], + ) -> Sequence[float | tuple[float, float]]: + _outputs: list[float | tuple[float, float]] = [] + for _obj in _objs: + if (_result := function(_obj)) is None: + continue + if isinstance(_result, Sequence) and not isinstance(_result, tuple): + _outputs.extend(_result) + else: + _outputs.append(_result) + return _outputs + + return cls.from_values( + successful=_extract_values(successful), + incomplete=_extract_values(incomplete), + errored=_extract_values(errored), + include_pdf=include_pdf, + epsilon=epsilon, + ) + + @classmethod + def rate_distribution_from_timings( + cls, + successful: Sequence[float | tuple[float, float]] | np.ndarray, + incomplete: Sequence[float | tuple[float, float]] | np.ndarray, + errored: Sequence[float | tuple[float, float]] | np.ndarray, + start_time: float | None = None, + end_time: float | None = None, + threshold: float | None = 1e-4, + include_pdf: bool | int = False, + epsilon: float = 1e-6, + ) -> StatusDistributionSummary: + """ + Create status-broken-down rate distribution from event timestamps. 
+ + :param successful: Timestamps for successful request events + :param incomplete: Timestamps for incomplete request events + :param errored: Timestamps for errored request events + :param start_time: Analysis window start + :param end_time: Analysis window end + :param threshold: Time threshold for merging nearby events + :param include_pdf: Whether to include PDF; True for full, int for sampled size + :param epsilon: Tolerance for probability validation + :return: Status breakdown of rate distribution summaries + """ + total, successful_arr, incomplete_arr, errored_arr = cls._combine_status_arrays( + successful, incomplete, errored, num_values_per_item=2 + ) + + return StatusDistributionSummary( + total=DistributionSummary.rate_distribution_from_timings( + total, + start_time=start_time, + end_time=end_time, + threshold=threshold, + include_pdf=include_pdf, + epsilon=epsilon, + ), + successful=DistributionSummary.rate_distribution_from_timings( + successful_arr, + start_time=start_time, + end_time=end_time, + threshold=threshold, + include_pdf=include_pdf, + epsilon=epsilon, + ), + incomplete=DistributionSummary.rate_distribution_from_timings( + incomplete_arr, + start_time=start_time, + end_time=end_time, + threshold=threshold, + include_pdf=include_pdf, + epsilon=epsilon, + ), + errored=DistributionSummary.rate_distribution_from_timings( + errored_arr, + start_time=start_time, + end_time=end_time, + threshold=threshold, + include_pdf=include_pdf, + epsilon=epsilon, + ), + ) + + @classmethod + def rate_distribution_from_timings_function( + cls, + function: Callable[ + [FunctionObjT], + float | tuple[float, float] | Sequence[float | tuple[float, float]] | None, + ], + successful: Sequence[FunctionObjT], + incomplete: Sequence[FunctionObjT], + errored: Sequence[FunctionObjT], + start_time: float | None = None, + end_time: float | None = None, + threshold: float | None = 1e-4, + include_pdf: bool | int = False, + epsilon: float = 1e-6, + ) -> StatusDistributionSummary: + """ + Create rate distribution by extracting timestamps from objects via function. 
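        For example, with hypothetical request records stored as dicts, the extractor returns the completion timestamp and returning None skips a record:
        ::
            completed = [{"finished_at": 100.2}, {"finished_at": 100.9}, {"finished_at": None}]
            rate = StatusDistributionSummary.rate_distribution_from_timings_function(
                function=lambda record: record["finished_at"],
                successful=completed,
                incomplete=[],
                errored=[],
            )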
+ + :param function: Function to extract timestamp(s) from each object + :param successful: Successful request objects + :param incomplete: Incomplete request objects + :param errored: Errored request objects + :param start_time: Analysis window start + :param end_time: Analysis window end + :param threshold: Time threshold for merging nearby events + :param include_pdf: Whether to include PDF; True for full, int for sampled size + :param epsilon: Tolerance for probability validation + :return: Status breakdown of rate distribution summaries + """ + + def _extract_values( + _objs: Sequence[FunctionObjT], + ) -> Sequence[float | tuple[float, float]]: + _outputs: list[float | tuple[float, float]] = [] + for _obj in _objs: + if (_result := function(_obj)) is None: + continue + if isinstance(_result, Sequence) and not isinstance(_result, tuple): + _outputs.extend(_result) + else: + _outputs.append(_result) + return _outputs + + return cls.rate_distribution_from_timings( + successful=_extract_values(successful), + incomplete=_extract_values(incomplete), + errored=_extract_values(errored), + start_time=start_time, + end_time=end_time, + threshold=threshold, + include_pdf=include_pdf, + epsilon=epsilon, + ) + + @classmethod + def concurrency_distribution_from_timings( + cls, + successful: Sequence[tuple[float, float] | tuple[float, float, float]] + | np.ndarray, + incomplete: Sequence[tuple[float, float] | tuple[float, float, float]] + | np.ndarray, + errored: Sequence[tuple[float, float] | tuple[float, float, float]] + | np.ndarray, + start_time: float | None = None, + end_time: float | None = None, + threshold: float | None = 1e-4, + include_pdf: bool | int = False, + epsilon: float = 1e-6, + ) -> StatusDistributionSummary: + """ + Create status-broken-down concurrency distribution from event intervals. 
+ + :param successful: Event intervals for successful requests + :param incomplete: Event intervals for incomplete requests + :param errored: Event intervals for errored requests + :param start_time: Analysis window start + :param end_time: Analysis window end + :param threshold: Time threshold for merging nearby transitions + :param include_pdf: Whether to include PDF; True for full, int for sampled size + :param epsilon: Tolerance for probability validation + :return: Status breakdown of concurrency distribution summaries + """ + total, successful_arr, incomplete_arr, errored_arr = cls._combine_status_arrays( + successful, incomplete, errored, num_values_per_item=3 + ) + + return StatusDistributionSummary( + total=DistributionSummary.concurrency_distribution_from_timings( + total, + start_time=start_time, + end_time=end_time, + threshold=threshold, + include_pdf=include_pdf, + epsilon=epsilon, + ), + successful=DistributionSummary.concurrency_distribution_from_timings( + successful_arr, + start_time=start_time, + end_time=end_time, + threshold=threshold, + include_pdf=include_pdf, + epsilon=epsilon, + ), + incomplete=DistributionSummary.concurrency_distribution_from_timings( + incomplete_arr, + start_time=start_time, + end_time=end_time, + threshold=threshold, + include_pdf=include_pdf, + epsilon=epsilon, + ), + errored=DistributionSummary.concurrency_distribution_from_timings( + errored_arr, + start_time=start_time, + end_time=end_time, + threshold=threshold, + include_pdf=include_pdf, + epsilon=epsilon, + ), + ) + + @classmethod + def concurrency_distribution_from_timings_function( + cls, + function: Callable[ + [FunctionObjT], + tuple[float, float] + | tuple[float, float, float] + | Sequence[tuple[float, float] | tuple[float, float, float]] + | None, + ], + successful: Sequence[FunctionObjT], + incomplete: Sequence[FunctionObjT], + errored: Sequence[FunctionObjT], + start_time: float | None = None, + end_time: float | None = None, + threshold: float | None = 1e-4, + include_pdf: bool | int = False, + epsilon: float = 1e-6, + ) -> StatusDistributionSummary: + """ + Create concurrency distribution by extracting intervals from objects. 
+ + :param function: Function to extract time interval(s) from each object + :param successful: Successful request objects + :param incomplete: Incomplete request objects + :param errored: Errored request objects + :param start_time: Analysis window start + :param end_time: Analysis window end + :param threshold: Time threshold for merging nearby transitions + :param include_pdf: Whether to include PDF; True for full, int for sampled size + :param epsilon: Tolerance for probability validation + :return: Status breakdown of concurrency distribution summaries + """ + + def _extract_values( + _objs: Sequence[FunctionObjT], + ) -> Sequence[tuple[float, float] | tuple[float, float, float]]: + _outputs: list[tuple[float, float] | tuple[float, float, float]] = [] + for _obj in _objs: + if (_result := function(_obj)) is None: + continue + if isinstance(_result, Sequence) and not isinstance(_result, tuple): + _outputs.extend(_result) + else: + _outputs.append(_result) + return _outputs + + return cls.concurrency_distribution_from_timings( + successful=_extract_values(successful), + incomplete=_extract_values(incomplete), + errored=_extract_values(errored), + start_time=start_time, + end_time=end_time, + threshold=threshold, + include_pdf=include_pdf, + epsilon=epsilon, + ) + + @classmethod + def _combine_status_arrays( + cls, + successful: Sequence[float | tuple[float, float] | tuple[float, float, float]] + | np.ndarray, + incomplete: Sequence[float | tuple[float, float] | tuple[float, float, float]] + | np.ndarray, + errored: Sequence[float | tuple[float, float] | tuple[float, float, float]] + | np.ndarray, + num_values_per_item: Literal[2, 3], + ) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: + successful_array = DistributionSummary._to_weighted_ndarray( # noqa: SLF001 + successful, num_values_per_item=num_values_per_item + ) + incomplete_array = DistributionSummary._to_weighted_ndarray( # noqa: SLF001 + incomplete, num_values_per_item=num_values_per_item + ) + errored_array = DistributionSummary._to_weighted_ndarray( # noqa: SLF001 + errored, num_values_per_item=num_values_per_item + ) + total_array = np.concatenate( + (successful_array, incomplete_array, errored_array), axis=0 + ) + return total_array, successful_array, incomplete_array, errored_array diff --git a/src/guidellm/schemas/stats.py b/src/guidellm/schemas/stats.py deleted file mode 100644 index 67f1d26c..00000000 --- a/src/guidellm/schemas/stats.py +++ /dev/null @@ -1,228 +0,0 @@ -""" -Request statistics and metrics for generative AI benchmark analysis. - -Provides data structures for capturing and analyzing performance metrics from -generative AI workloads. Contains request-level statistics including token counts, -latency measurements, and throughput calculations for text generation benchmarks. -""" - -from __future__ import annotations - -from typing import Literal - -from pydantic import Field, computed_field - -from guidellm.schemas.info import RequestInfo -from guidellm.schemas.request import GenerativeRequestType, UsageMetrics -from guidellm.utils import StandardBaseDict - -__all__ = ["GenerativeRequestStats"] - - -class GenerativeRequestStats(StandardBaseDict): - """ - Request statistics for generative AI text generation workloads. - - Captures comprehensive performance metrics for individual generative requests, - including token counts, timing measurements, and derived performance statistics. 
- Provides computed properties for latency analysis, throughput calculations, - and token generation metrics essential for benchmark evaluation. - - Example: - :: - stats = GenerativeRequestStats( - request_id="req_123", - request_type="text_completion", - info=request_info, - input_metrics=input_usage, - output_metrics=output_usage - ) - throughput = stats.output_tokens_per_second - """ - - type_: Literal["generative_request_stats"] = "generative_request_stats" - request_id: str = Field(description="Unique identifier for the request") - request_type: GenerativeRequestType | str = Field( - description="Type of generative request: text or chat completion" - ) - request_args: str | None = Field( - default=None, description="Arguments passed to the backend for this request" - ) - output: str | None = Field( - description="Generated text output, if request completed successfully" - ) - info: RequestInfo = Field( - description="Metadata and timing information for the request" - ) - input_metrics: UsageMetrics = Field( - description="Usage statistics for the input prompt" - ) - output_metrics: UsageMetrics = Field( - description="Usage statistics for the generated output" - ) - - # Request stats - @computed_field # type: ignore[misc] - @property - def request_latency(self) -> float | None: - """ - End-to-end request processing latency in seconds. - - :return: Duration from request start to completion, or None if unavailable. - """ - if not self.info.timings.request_end or not self.info.timings.request_start: - return None - - return self.info.timings.request_end - self.info.timings.request_start - - # General token stats - @computed_field # type: ignore[misc] - @property - def prompt_tokens(self) -> int | None: - """ - Number of tokens in the input prompt. - - :return: Input prompt token count, or None if unavailable. - """ - return self.input_metrics.text_tokens - - @computed_field # type: ignore[misc] - @property - def input_tokens(self) -> int | None: - """ - Number of tokens in the input prompt. - - :return: Input prompt token count, or None if unavailable. - """ - return self.input_metrics.total_tokens - - @computed_field # type: ignore[misc] - @property - def output_tokens(self) -> int | None: - """ - Number of tokens in the generated output. - - :return: Generated output token count, or None if unavailable. - """ - return self.output_metrics.total_tokens - - @computed_field # type: ignore[misc] - @property - def total_tokens(self) -> int | None: - """ - Total token count including prompt and output tokens. - - :return: Sum of prompt and output tokens, or None if either is unavailable. - """ - input_tokens = self.input_metrics.total_tokens - output_tokens = self.output_metrics.total_tokens - - if input_tokens is None and output_tokens is None: - return None - - return (input_tokens or 0) + (output_tokens or 0) - - @computed_field # type: ignore[misc] - @property - def time_to_first_token_ms(self) -> float | None: - """ - Time to first token generation in milliseconds. - - :return: Latency from request start to first token, or None if unavailable. - """ - if ( - not self.info.timings.first_iteration - or not self.info.timings.request_start - or self.info.timings.first_iteration == self.info.timings.last_iteration - ): - return None - - return 1000 * ( - self.info.timings.first_iteration - self.info.timings.request_start - ) - - @computed_field # type: ignore[misc] - @property - def time_per_output_token_ms(self) -> float | None: - """ - Average time per output token in milliseconds. 
- - Includes time for first token and all subsequent tokens. - - :return: Average milliseconds per output token, or None if unavailable. - """ - if ( - not self.info.timings.request_start - or not self.info.timings.last_iteration - or not self.output_metrics.total_tokens - ): - return None - - return ( - 1000 - * (self.info.timings.last_iteration - self.info.timings.request_start) - / self.output_metrics.total_tokens - ) - - @computed_field # type: ignore[misc] - @property - def inter_token_latency_ms(self) -> float | None: - """ - Average inter-token latency in milliseconds. - - Measures time between token generations, excluding first token. - - :return: Average milliseconds between tokens, or None if unavailable. - """ - if ( - not self.info.timings.first_iteration - or not self.info.timings.last_iteration - or not self.output_metrics.total_tokens - or self.output_metrics.total_tokens <= 1 - ): - return None - - return ( - 1000 - * (self.info.timings.last_iteration - self.info.timings.first_iteration) - / (self.output_metrics.total_tokens - 1) - ) - - @computed_field # type: ignore[misc] - @property - def tokens_per_second(self) -> float | None: - """ - Overall token throughput including prompt and output tokens. - - :return: Total tokens per second, or None if unavailable. - """ - if not (latency := self.request_latency) or self.total_tokens is None: - return None - - return self.total_tokens / latency - - @computed_field # type: ignore[misc] - @property - def output_tokens_per_second(self) -> float | None: - """ - Output token generation throughput. - - :return: Output tokens per second, or None if unavailable. - """ - if not (latency := self.request_latency) or self.output_tokens is None: - return None - - return self.output_tokens / latency - - @computed_field # type: ignore[misc] - @property - def output_tokens_per_iteration(self) -> float | None: - """ - Average output tokens generated per iteration. - - :return: Output tokens per iteration, or None if unavailable. 
- """ - if self.output_tokens is None or not self.info.timings.iterations: - return None - - return self.output_tokens / self.info.timings.iterations diff --git a/src/guidellm/settings.py b/src/guidellm/settings.py index f03b19e2..12c8ef30 100644 --- a/src/guidellm/settings.py +++ b/src/guidellm/settings.py @@ -162,7 +162,7 @@ class Settings(BaseSettings): preferred_output_tokens_source: Literal["request", "response"] = "response" preferred_backend: Literal["openai"] = "openai" preferred_route: Literal["text_completions", "chat_completions"] = ( - "text_completions" + "chat_completions" ) openai: OpenAISettings = OpenAISettings() diff --git a/src/guidellm/utils/__init__.py b/src/guidellm/utils/__init__.py index 89312771..0874c291 100644 --- a/src/guidellm/utils/__init__.py +++ b/src/guidellm/utils/__init__.py @@ -13,6 +13,7 @@ all_defined, safe_add, safe_divide, + safe_format_number, safe_format_timestamp, safe_getattr, safe_multiply, @@ -28,23 +29,9 @@ SendMessageT, ) from .mixins import InfoMixin -from .pydantic_utils import ( - PydanticClassRegistryMixin, - ReloadableBaseModel, - StandardBaseDict, - StandardBaseModel, - StatusBreakdown, -) from .random import IntegerRangeSampler from .registry import RegistryMixin, RegistryObjT from .singleton import SingletonMixin, ThreadSafeSingletonMixin -from .statistics import ( - DistributionSummary, - Percentiles, - RunningStats, - StatusDistributionSummary, - TimeRunningStats, -) from .synchronous import ( wait_for_sync_barrier, wait_for_sync_event, @@ -67,11 +54,9 @@ "SUPPORTED_TYPES", "AutoImporterMixin", "Colors", - "Colors", "Console", "ConsoleUpdateStep", "DefaultGroupHandler", - "DistributionSummary", "Encoder", "EncodingTypesAlias", "EndlessTextCreator", @@ -82,25 +67,15 @@ "InterProcessMessagingPipe", "InterProcessMessagingQueue", "MessageEncoding", - "MessageEncoding", - "Percentiles", - "PydanticClassRegistryMixin", "RegistryMixin", "RegistryObjT", - "ReloadableBaseModel", - "RunningStats", "SendMessageT", "SerializationTypesAlias", "Serializer", "SingletonMixin", - "StandardBaseDict", - "StandardBaseModel", - "StatusBreakdown", - "StatusDistributionSummary", "StatusIcons", "StatusStyles", "ThreadSafeSingletonMixin", - "TimeRunningStats", "all_defined", "camelize_str", "check_load_processor", @@ -114,6 +89,7 @@ "recursive_key_update", "safe_add", "safe_divide", + "safe_format_number", "safe_format_timestamp", "safe_getattr", "safe_multiply", diff --git a/src/guidellm/utils/console.py b/src/guidellm/utils/console.py index 54e90cf7..bdb2da86 100644 --- a/src/guidellm/utils/console.py +++ b/src/guidellm/utils/console.py @@ -1,8 +1,18 @@ +""" +Console utilities for rich terminal output and status updates. + +Provides an extended Rich console with custom formatting for status messages, +progress tracking, and tabular data display. Includes predefined color schemes, +status levels, icons, and styles for consistent terminal output across the +application. Supports multi-step operations with spinners and context managers +for clean progress reporting. 
+""" + from __future__ import annotations -from collections.abc import Mapping +from collections.abc import Mapping, Sequence from dataclasses import dataclass -from typing import Any, Literal +from typing import Annotated, Any, Literal from rich.console import Console as RichConsole from rich.padding import Padding @@ -14,11 +24,41 @@ "Console", "ConsoleUpdateStep", "StatusIcons", + "StatusLevel", "StatusStyles", ] +StatusLevel = Annotated[ + Literal[ + "debug", + "info", + "warning", + "error", + "critical", + "notset", + "success", + ], + "Status level for console messages indicating severity or state", +] + class Colors: + """ + Color constants for console styling. + + Provides standardized color schemes for different message types and branding. + Colors are defined using Rich console color names or hex values. + + :cvar info: Color for informational messages + :cvar progress: Color for progress indicators + :cvar success: Color for successful operations + :cvar warning: Color for warning messages + :cvar error: Color for error messages + :cvar primary: Primary brand color + :cvar secondary: Secondary brand color + :cvar tertiary: Tertiary brand color + """ + # Core states info: str = "light_steel_blue" progress: str = "dark_slate_gray1" @@ -32,7 +72,10 @@ class Colors: tertiary: str = "#008080" -StatusIcons: Mapping[str, str] = { +StatusIcons: Annotated[ + Mapping[str, str], + "Mapping of status levels to unicode icon characters for visual indicators", +] = { "debug": "…", "info": "ℹ", "warning": "⚠", @@ -42,7 +85,10 @@ class Colors: "success": "✔", } -StatusStyles: Mapping[str, str] = { +StatusStyles: Annotated[ + Mapping[str, str], + "Mapping of status levels to Rich console style strings for colored output", +] = { "debug": "dim", "info": f"bold {Colors.info}", "warning": f"bold {Colors.warning}", @@ -55,95 +101,119 @@ class Colors: @dataclass class ConsoleUpdateStep: + """ + Context manager for multi-step progress operations with spinner. + + Displays animated spinner during operation execution and allows dynamic + status updates. Automatically stops spinner on exit and prints final + status message. Designed for use with Python's `with` statement. + + Example: + :: + console = Console() + with console.print_update_step("Processing data") as step: + step.update("Loading files", "info") + # ... do work ... 
+ step.finish("Completed successfully", status_level="success") + + :param console: The Console instance to use for output + :param title: Initial progress message to display + :param details: Optional additional details to show after completion + :param status_level: Initial status level determining style and icon + :param spinner: Spinner animation style name from Rich's spinner set + """ + console: Console title: str details: Any | None = None - status_level: Literal[ - "debug", - "info", - "warning", - "error", - "critical", - "notset", - "success", - ] = "info" + status_level: StatusLevel = "info" spinner: str = "dots" _status: Status | None = None - def __enter__(self): + def __enter__(self) -> ConsoleUpdateStep: if self.console.quiet: return self + style = StatusStyles.get(self.status_level, "bold") self._status = self.console.status( - f"[{StatusStyles.get(self.status_level, 'bold')}]{self.title}[/]", + f"[{style}]{self.title}[/]", spinner=self.spinner, ) self._status.__enter__() return self - def update( - self, - title: str, - status_level: Literal[ - "debug", - "info", - "warning", - "error", - "critical", - "notset", - "success", - ] - | None = None, - ): + def update(self, title: str, status_level: StatusLevel | None = None): + """ + Update the progress message and optionally the status level. + + :param title: New progress message to display + :param status_level: Optional new status level to apply + """ self.title = title if status_level is not None: self.status_level = status_level + if self._status: - self._status.update( - status=f"[{StatusStyles.get(self.status_level, 'bold')}]{title}[/]" - ) + style = StatusStyles.get(self.status_level, "bold") + self._status.update(status=f"[{style}]{title}[/]") def finish( self, title: str, details: Any | None = None, - status_level: Literal[ - "debug", - "info", - "warning", - "error", - "critical", - "notset", - "success", - ] = "info", + status_level: StatusLevel = "info", ): + """ + Stop the spinner and print the final status message. + + :param title: Final completion message to display + :param details: Optional additional information to show below message + :param status_level: Status level for final message styling + """ self.title = title self.status_level = status_level + if self._status: self._status.stop() + self.console.print_update(title, details, status_level) def __exit__(self, exc_type, exc_val, exc_tb): if self._status: - return self._status.__exit__(exc_type, exc_val, exc_tb) - return False + self._status.__exit__(exc_type, exc_val, exc_tb) class Console(RichConsole): + """ + Extended Rich console with custom formatting and status reporting. + + Enhances Rich's Console with specialized methods for status messages, + progress tracking with spinners, and formatted table output. Provides + consistent styling through predefined status levels, icons, and colors. + Supports quiet mode to suppress non-critical output. + + Example: + :: + console = Console() + console.print_update("Starting process", status="info") + with console.print_update_step("Loading data") as step: + step.update("Processing items") + step.finish("Complete", status_level="success") + """ + def print_update( self, title: str, - details: str | None = None, - status: Literal[ - "debug", - "info", - "warning", - "error", - "critical", - "notset", - "success", - ] = "info", - ) -> None: + details: Any | None = None, + status: StatusLevel = "info", + ): + """ + Print a status message with icon and optional details. 
+ + :param title: Main status message to display + :param details: Optional additional details shown indented below message + :param status: Status level determining icon and styling + """ icon = StatusIcons.get(status, "•") style = StatusStyles.get(status, "bold") line = Text.assemble(f"{icon} ", (title, style)) @@ -151,6 +221,11 @@ def print_update( self.print_update_details(details) def print_update_details(self, details: Any | None): + """ + Print additional details indented below a status message. + + :param details: Content to display, converted to string and styled dimly + """ if details: block = Padding( Text.from_markup(str(details)), @@ -162,18 +237,19 @@ def print_update_details(self, details: Any | None): def print_update_step( self, title: str, - status: Literal[ - "debug", - "info", - "warning", - "error", - "critical", - "notset", - "success", - ] = "info", + status: StatusLevel = "info", details: Any | None = None, spinner: str = "dots", ) -> ConsoleUpdateStep: + """ + Create a context manager for multi-step progress with spinner. + + :param title: Initial progress message to display + :param status: Initial status level for styling + :param details: Optional details to show after completion + :param spinner: Spinner animation style name + :return: ConsoleUpdateStep context manager for progress tracking + """ return ConsoleUpdateStep( console=self, title=title, @@ -181,3 +257,310 @@ def print_update_step( status_level=status, spinner=spinner, ) + + def print_tables( + self, + header_cols_groups: Sequence[Sequence[str | list[str]]], + value_cols_groups: Sequence[Sequence[str | list[str]]], + title: str | None = None, + widths: Sequence[int] | None = None, + ): + """ + Print multiple tables with uniform column widths. + + :param header_cols_groups: List of header column groups for each table + :param value_cols_groups: List of value column groups for each table + :param title: Optional title to display before tables + :param widths: Optional minimum column widths to enforce + """ + if title is not None: + self.print_update(title, None, "info") + + # Format all groups to determine uniform widths + widths = widths or None + headers = [] + values = [] + + # Process all tables to get consistent widths + for value_cols in value_cols_groups: + formatted, widths = self._format_table_columns(value_cols, widths) + values.append(formatted) + for header_cols in header_cols_groups: + formatted, widths = self._format_table_headers(header_cols, widths) + headers.append(formatted) + + # Print each table + for ind, (header, value) in enumerate(zip(headers, values, strict=False)): + is_last = ind == len(headers) - 1 + self.print_table( + header, + value, + widths=widths, + apply_formatting=False, + print_bottom_divider=is_last, + ) + + def print_table( + self, + header_cols: Sequence[str | list[str]], + value_cols: Sequence[str | list[str]], + title: str | None = None, + widths: Sequence[int] | None = None, + apply_formatting: bool = True, + print_bottom_divider: bool = True, + ): + """ + Print a formatted table with headers and values. 
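A minimal usage sketch of this column-oriented API, assuming the same Console instance: each entry in header_cols and value_cols is one column, given either as a single string or as a list of per-row strings, and identical adjacent header cells are merged into a single spanning header::

    console.print_table(
        header_cols=[
            ["", "Benchmark"],
            ["Latency (ms)", "Mean"],  # identical adjacent top cells are merged
            ["Latency (ms)", "P95"],   # into one header spanning both columns
        ],
        value_cols=[
            ["baseline", "tuned"],
            ["12.3", "10.1"],
            ["45.6", "38.2"],
        ],
        title="Benchmark summary",
    )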
+ + :param header_cols: List of header columns, each string or list of strings + :param value_cols: List of value columns, each string or list of strings + :param title: Optional title to display before table + :param widths: Optional minimum column widths to enforce + :param apply_formatting: Whether to calculate widths and format columns + :param print_bottom_divider: Whether to print bottom border line + """ + if title is not None: + self.print_update(title, None, "info") + + # Format data + values: list[list[str]] + headers: list[list[str]] + final_widths: list[int] + + if apply_formatting: + values, final_widths = self._format_table_columns(value_cols, widths) + headers, final_widths = self._format_table_headers( + header_cols, final_widths + ) + else: + values = [col if isinstance(col, list) else [col] for col in value_cols] + headers = [col if isinstance(col, list) else [col] for col in header_cols] + final_widths = list(widths) if widths else [] + + # Print table structure + self.print_table_divider(final_widths, "=") + self.print_table_headers(headers, final_widths) + self.print_table_divider(final_widths, "-") + self.print_table_values(values, final_widths) + + if print_bottom_divider: + self.print_table_divider(final_widths, "=") + + def print_table_divider(self, widths: Sequence[int], char: str): + """ + Print a horizontal divider line across table columns. + + :param widths: Column widths for divider line + :param char: Character to use for divider line (e.g., '=', '-') + """ + self.print_table_row( + [""] * len(widths), + widths=widths, + spacer=char, + cell_style="bold", + divider_style="bold", + edge_style="bold", + ) + + def print_table_headers(self, headers: Sequence[list[str]], widths: Sequence[int]): + """ + Print header rows with support for column spanning. + + :param headers: List of header columns, each containing header row values + :param widths: Column widths for proper alignment + """ + if not headers or not headers[0]: + return + + for row_idx in range(len(headers[0])): + # Calculate widths for this header row, accounting for merged cells. + row_widths = list(widths) + for col_idx in range(len(headers)): + if not headers[col_idx][row_idx]: + continue + + # Find span end + span_end = col_idx + 1 + while span_end < len(headers) and not headers[span_end][row_idx]: + row_widths[span_end] = 0 + span_end += 1 + + # Set combined width for the first cell in span + row_widths[col_idx] = sum( + widths[col] for col in range(col_idx, span_end) + ) + + # Print the header row + self.print_table_row( + values=[headers[col][row_idx] for col in range(len(headers))], + widths=row_widths, + cell_style="bold", + divider_style="bold", + edge_style="bold", + ) + + def print_table_values(self, values: Sequence[list[str]], widths: Sequence[int]): + """ + Print all data rows in the table. + + :param values: List of value columns, each containing row values + :param widths: Column widths for proper alignment + """ + if not values: + return + + for row_idx in range(len(values[0])): + # Print the value row + self.print_table_row( + values=[values[col][row_idx] for col in range(len(values))], + widths=widths, + divider="|", + edge_style="bold", + ) + + def print_table_row( + self, + values: Sequence[str], + widths: Sequence[int] | None = None, + spacer: str = " ", + divider: str = "|", + cell_style: str = "", + value_style: str = "", + divider_style: str = "", + edge_style: str = "", + ): + """ + Print a single table row with custom styling. 
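A rough sketch of what a single row resolves to before Rich renders the markup, again assuming the same Console instance::

    console.print_table_row(
        ["alpha", "beta"],
        widths=[8, 8],
        cell_style="bold",
    )
    # builds and prints the line:
    # |[bold]alpha   [/bold]|[bold]beta    [/bold]|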
+ + :param values: Cell values for the row + :param widths: Column widths, defaults to value lengths + :param spacer: Character for padding cells + :param divider: Character separating columns + :param cell_style: Rich style string for entire cells + :param value_style: Rich style string for cell values only + :param divider_style: Rich style string for column dividers + :param edge_style: Rich style string for table edges + """ + widths = widths or [len(val) for val in values] + + # Build styled cells + cells = [] + for val, width in zip(values, widths, strict=True): + cell = val.ljust(width, spacer) + if value_style and val: + cell = cell.replace(val, f"[{value_style}]{val}[/{value_style}]") + if cell_style: + cell = f"[{cell_style}]{cell}[/{cell_style}]" + cells.append(cell) + + # Build and print row + edge = f"[{edge_style}]{divider}[/{edge_style}]" if edge_style else divider + inner = ( + f"[{divider_style}]{divider}[/{divider_style}]" + if divider_style + else divider + ) + line = edge + inner.join(cells) + edge + self.print(line, overflow="ignore", crop=False) + + def _format_table_headers( + self, + headers: Sequence[str | list[str]], + col_widths: Sequence[int] | None = None, + spacer: str = " ", + min_padding: int = 1, + ) -> tuple[list[list[str]], list[int]]: + formatted, header_widths = self._format_table_columns( + headers, col_widths, spacer, min_padding + ) + + if not formatted or not formatted[0]: + return formatted, [] + + # Merge identical adjacent headers row by row + widths = list(col_widths) if col_widths else header_widths + for row_idx in range(len(formatted[0])): + last_value = None + start_col = -1 + + for col_idx in range(len(formatted) + 1): + cur_value = ( + formatted[col_idx][row_idx] if col_idx < len(formatted) else None + ) + + # Check if we should continue merging + if ( + col_idx < len(formatted) + and cur_value != "" + and cur_value == last_value + and ( + row_idx == 0 + or headers[start_col][row_idx - 1] + == headers[col_idx][row_idx - 1] + ) + ): + continue + + # Finalize previous + if start_col >= 0: + # Clear merged cells to keep only the first + for col in range(start_col + 1, col_idx): + formatted[col][row_idx] = "" + + # Adjust widths of columns in the merged span, if needed + if (required := len(formatted[start_col][row_idx])) > ( + current := sum(widths[col] for col in range(start_col, col_idx)) + ): + diff = required - current + cols_count = col_idx - start_col + per_col = diff // cols_count + extra = diff % cols_count + + for col in range(start_col, col_idx): + widths[col] += per_col + if extra > 0: + widths[col] += 1 + extra -= 1 + + # Start new merge + last_value = cur_value + start_col = col_idx + + return formatted, widths + + def _format_table_columns( + self, + columns: Sequence[str | list[str]], + col_widths: Sequence[int] | None = None, + spacer: str = " ", + min_padding: int = 1, + ) -> tuple[list[list[str]], list[int]]: + if not columns: + return [], [] + + # Normalize to list of lists + max_rows = max(len(col) if isinstance(col, list) else 1 for col in columns) + + formatted = [] + for col in columns: + col_list = col if isinstance(col, list) else [col] + # Pad to max height + col_list = col_list + [""] * (max_rows - len(col_list)) + # Add cell padding + padding = spacer * min_padding + col_list = [ + f"{padding}{item}{padding}" if item else "" for item in col_list + ] + formatted.append(col_list) + + # Calculate widths + widths = [max(len(row) for row in col) for col in formatted] + + # Apply minimum widths if provided + if 
col_widths is not None: + widths = [ + max(width, min_w) + for width, min_w in zip(widths, col_widths, strict=True) + ] + + return formatted, widths diff --git a/src/guidellm/utils/functions.py b/src/guidellm/utils/functions.py index ed4a2075..0633616c 100644 --- a/src/guidellm/utils/functions.py +++ b/src/guidellm/utils/functions.py @@ -15,6 +15,7 @@ "all_defined", "safe_add", "safe_divide", + "safe_format_number", "safe_format_timestamp", "safe_getattr", "safe_multiply", @@ -115,7 +116,7 @@ def safe_add( def safe_format_timestamp( - timestamp: float | None, format_: str = "%H:%M:%S", default: str = "N/A" + timestamp: float | int | None, format_: str = "%H:%M:%S", default: str = "N/A" ) -> str: """ Safely format a timestamp with error handling and validation. @@ -132,3 +133,27 @@ def safe_format_timestamp( return datetime.fromtimestamp(timestamp).strftime(format_) except (ValueError, OverflowError, OSError): return default + + +def safe_format_number( + number: int | float | None, precision: int = 1, default: str = "--" +) -> str: + """ + Safely format a number with specified precision and default handling. + + :param number: Number to format, or None + :param precision: Number of decimal places for formatting floats + :param default: Value to return if number is None + :return: Formatted number string or default value + """ + if number is None: + return default + + if isinstance(number, int): + return str(number) + + try: + format_str = f"{{:.{precision}f}}" + return format_str.format(number) + except (ValueError, TypeError): + return default diff --git a/src/guidellm/utils/statistics.py b/src/guidellm/utils/statistics.py deleted file mode 100644 index a8403c72..00000000 --- a/src/guidellm/utils/statistics.py +++ /dev/null @@ -1,1047 +0,0 @@ -""" -Statistical analysis utilities for distribution calculations and running metrics. - -Provides comprehensive statistical computation tools for analyzing numerical -distributions, percentiles, and streaming data. Includes specialized support for -request timing analysis, concurrency measurement, and rate calculations. Integrates -with Pydantic for serializable statistical models and supports both weighted and -unweighted distributions with cumulative distribution function (CDF) generation. -""" - -from __future__ import annotations - -import math -import time as timer -from collections import defaultdict -from typing import Any, Literal - -import numpy as np -from pydantic import Field, computed_field - -from guidellm.utils.pydantic_utils import StandardBaseModel, StatusBreakdown - -__all__ = [ - "DistributionSummary", - "Percentiles", - "RunningStats", - "StatusDistributionSummary", - "TimeRunningStats", -] - - -class Percentiles(StandardBaseModel): - """ - Standard percentiles model for statistical distribution analysis. - - Provides complete percentile coverage from 0.1th to 99.9th percentiles for - statistical distribution characterization. Used as a component within - DistributionSummary to provide detailed distribution shape analysis. 
- """ - - p001: float = Field( - description="The 0.1th percentile of the distribution.", - ) - p01: float = Field( - description="The 1st percentile of the distribution.", - ) - p05: float = Field( - description="The 5th percentile of the distribution.", - ) - p10: float = Field( - description="The 10th percentile of the distribution.", - ) - p25: float = Field( - description="The 25th percentile of the distribution.", - ) - p50: float = Field( - description="The 50th percentile of the distribution.", - ) - p75: float = Field( - description="The 75th percentile of the distribution.", - ) - p90: float = Field( - description="The 90th percentile of the distribution.", - ) - p95: float = Field( - description="The 95th percentile of the distribution.", - ) - p99: float = Field( - description="The 99th percentile of the distribution.", - ) - p999: float = Field( - description="The 99.9th percentile of the distribution.", - ) - - -class DistributionSummary(StandardBaseModel): - """ - Comprehensive statistical summary for numerical value distributions. - - Calculates and stores complete statistical metrics including central tendency, - dispersion, extremes, and percentiles for any numerical distribution. Supports - both weighted and unweighted data with optional cumulative distribution function - generation. Primary statistical analysis tool for request timing, performance - metrics, and benchmark result characterization. - - Example: - :: - # Create from simple values - summary = DistributionSummary.from_values([1.0, 2.0, 3.0, 4.0, 5.0]) - print(f"Mean: {summary.mean}, P95: {summary.percentiles.p95}") - - # Create from request timings for concurrency analysis - requests = [(0.0, 1.0), (0.5, 2.0), (1.0, 2.5)] - concurrency = DistributionSummary.from_request_times( - requests, "concurrency" - ) - """ - - mean: float = Field( - description="The mean/average of the distribution.", - ) - median: float = Field( - description="The median of the distribution.", - ) - mode: float = Field( - description="The mode of the distribution.", - ) - variance: float = Field( - description="The variance of the distribution.", - ) - std_dev: float = Field( - description="The standard deviation of the distribution.", - ) - min: float = Field( - description="The minimum value of the distribution.", - ) - max: float = Field( - description="The maximum value of the distribution.", - ) - count: int = Field( - description="The number of values in the distribution.", - ) - total_sum: float = Field( - description="The total sum of the values in the distribution.", - ) - percentiles: Percentiles = Field( - description="The percentiles of the distribution.", - ) - cumulative_distribution_function: list[tuple[float, float]] | None = Field( - description="The cumulative distribution function (CDF) of the distribution.", - default=None, - ) - - @staticmethod - def from_distribution_function( - distribution: list[tuple[float, float]], - include_cdf: bool = False, - ) -> DistributionSummary: - """ - Create statistical summary from weighted distribution or probability function. - - Converts weighted numerical values or probability distribution function (PDF) - into comprehensive statistical summary. Normalizes weights to probabilities - and calculates all statistical metrics including percentiles. 
- - :param distribution: List of (value, weight) or (value, probability) tuples - representing the distribution - :param include_cdf: Whether to include cumulative distribution function - in the output - :return: DistributionSummary instance with calculated statistical metrics - """ - values, weights = zip(*distribution, strict=True) if distribution else ([], []) - values = np.array(values) # type: ignore[assignment] - weights = np.array(weights) # type: ignore[assignment] - - # create the PDF - probabilities = weights / np.sum(weights) # type: ignore[operator] - pdf = np.column_stack((values, probabilities)) - pdf = pdf[np.argsort(pdf[:, 0])] - values = pdf[:, 0] # type: ignore[assignment] - probabilities = pdf[:, 1] - - # calculate the CDF - cumulative_probabilities = np.cumsum(probabilities) - cdf = np.column_stack((values, cumulative_probabilities)) - - # calculate statistics - mean = np.sum(values * probabilities).item() # type: ignore[attr-defined] - median = cdf[np.argmax(cdf[:, 1] >= 0.5), 0].item() if len(cdf) > 0 else 0 # noqa: PLR2004 - mode = values[np.argmax(probabilities)].item() if len(values) > 0 else 0 # type: ignore[call-overload] - variance = np.sum((values - mean) ** 2 * probabilities).item() # type: ignore[attr-defined] - std_dev = math.sqrt(variance) - minimum = values[0].item() if len(values) > 0 else 0 - maximum = values[-1].item() if len(values) > 0 else 0 - count = len(values) - total_sum = np.sum(values).item() # type: ignore[attr-defined] - - return DistributionSummary( - mean=mean, - median=median, - mode=mode, - variance=variance, - std_dev=std_dev, - min=minimum, - max=maximum, - count=count, - total_sum=total_sum, - percentiles=( - Percentiles( - p001=cdf[np.argmax(cdf[:, 1] >= 0.001), 0].item(), # noqa: PLR2004 - p01=cdf[np.argmax(cdf[:, 1] >= 0.01), 0].item(), # noqa: PLR2004 - p05=cdf[np.argmax(cdf[:, 1] >= 0.05), 0].item(), # noqa: PLR2004 - p10=cdf[np.argmax(cdf[:, 1] >= 0.1), 0].item(), # noqa: PLR2004 - p25=cdf[np.argmax(cdf[:, 1] >= 0.25), 0].item(), # noqa: PLR2004 - p50=cdf[np.argmax(cdf[:, 1] >= 0.50), 0].item(), # noqa: PLR2004 - p75=cdf[np.argmax(cdf[:, 1] >= 0.75), 0].item(), # noqa: PLR2004 - p90=cdf[np.argmax(cdf[:, 1] >= 0.9), 0].item(), # noqa: PLR2004 - p95=cdf[np.argmax(cdf[:, 1] >= 0.95), 0].item(), # noqa: PLR2004 - p99=cdf[np.argmax(cdf[:, 1] >= 0.99), 0].item(), # noqa: PLR2004 - p999=cdf[np.argmax(cdf[:, 1] >= 0.999), 0].item(), # noqa: PLR2004 - ) - if len(cdf) > 0 - else Percentiles( - p001=0, - p01=0, - p05=0, - p10=0, - p25=0, - p50=0, - p75=0, - p90=0, - p95=0, - p99=0, - p999=0, - ) - ), - cumulative_distribution_function=cdf.tolist() if include_cdf else None, - ) - - @staticmethod - def from_values( - values: list[float], - weights: list[float] | None = None, - include_cdf: bool = False, - ) -> DistributionSummary: - """ - Create statistical summary from numerical values with optional weights. - - Wrapper around from_distribution_function for simple value lists. If weights - are not provided, all values are equally weighted. Enables statistical - analysis of any numerical dataset. - - :param values: Numerical values representing the distribution - :param weights: Optional weights for each value. 
If not provided, all values - are equally weighted - :param include_cdf: Whether to include cumulative distribution function in - the output DistributionSummary - :return: DistributionSummary instance with calculated statistical metrics - :raises ValueError: If values and weights lists have different lengths - """ - if weights is None: - weights = [1.0] * len(values) - - if len(values) != len(weights): - raise ValueError( - "The length of values and weights must be the same.", - ) - - return DistributionSummary.from_distribution_function( - distribution=list(zip(values, weights, strict=True)), - include_cdf=include_cdf, - ) - - @staticmethod - def from_request_times( - requests: list[tuple[float, float]], - distribution_type: Literal["concurrency", "rate"], - weights: list[float] | None = None, - include_cdf: bool = False, - epsilon: float = 1e-6, - ) -> DistributionSummary: - """ - Create statistical summary from request timing data. - - Analyzes request start/end times to calculate concurrency or rate - distributions. Converts timing events into statistical metrics for - performance analysis and load characterization. - - :param requests: List of (start_time, end_time) tuples for each request - :param distribution_type: Type of analysis - "concurrency" for simultaneous - requests or "rate" for completion rates - :param include_cdf: Whether to include cumulative distribution function - :param epsilon: Threshold for merging close timing events - :return: DistributionSummary with timing-based statistical metrics - :raises ValueError: If distribution_type is not "concurrency" or "rate" - """ - if not weights: - weights = [1.0] * len(requests) - - if len(requests) != len(weights): - raise ValueError( - "The length of requests and weights must be the same.", - ) - - # First convert to timing events based on type - events = DistributionSummary._convert_to_timing_events( - requests, distribution_type, weights - ) - - # Combine any events within epsilon of each other for stability - flattened_events = DistributionSummary._combine_events(events, epsilon) - - # Convert events to value distribution function - distribution: dict[float, float] = defaultdict(float) - - if distribution_type == "concurrency": - # For concurrency, convert to active concurrency over time - active = 0.0 - for ind in range(len(flattened_events)): - time, change = flattened_events[ind] - active += change - flattened_events[ind] = (time, active) - - # Then convert to distribution by weighting each concurrency - # by duration to next event (last event is 0 concurrency) - for ind in range(len(flattened_events) - 1): - time, value = flattened_events[ind] - next_time = flattened_events[ind + 1][0] - duration = next_time - time - distribution[value] += duration - elif distribution_type == "rate": - # For rate, convert to distribution by converting each value - # to a rate (value/duration) weighted by duration from previous - # (first event is 0 rate) - for ind in range(1, len(flattened_events)): - time, value = flattened_events[ind] - prev_time = flattened_events[ind - 1][0] - duration = time - prev_time - rate = value / duration if duration > 0 else 0.0 - distribution[rate] += duration - else: - raise ValueError( - f"Invalid distribution_type '{distribution_type}'. " - "Must be 'concurrency' or 'rate'." 
- ) - - return DistributionSummary.from_distribution_function( - distribution=sorted(distribution.items()), - include_cdf=include_cdf, - ) - - @staticmethod - def _convert_to_timing_events( - requests: list[tuple[float, float]], - distribution_type: Literal["concurrency", "rate"], - weights: list[float], - ) -> list[tuple[float, float]]: - events: list[tuple[float, float]] = [] - - if distribution_type == "concurrency": - # For concurrency, each request adds to concurrency at start - # and subtracts at end - for (start, end), weight in zip(requests, weights, strict=False): - events.append((start, weight)) - events.append((end, -1 * weight)) - elif distribution_type == "rate": - # For rate, each request is added at the end time only - global_start = min(start for start, _ in requests) if requests else 0.0 - events.append((global_start, 0.0)) - for (_, end), weight in zip(requests, weights, strict=False): - events.append((end, weight)) - else: - raise ValueError( - f"Invalid distribution_type '{distribution_type}'. " - "Must be 'concurrency' or 'rate'." - ) - return events - - @staticmethod - def _combine_events( - events: list[tuple[float, float]], - epsilon: float, - ) -> list[tuple[float, float]]: - sorted_events = sorted(events, key=lambda event: event[0]) - flattened_events: list[tuple[float, float]] = ( - [sorted_events.pop(0)] if sorted_events else [] - ) - last_time = flattened_events[0][0] if flattened_events else 0.0 - - for time, val in sorted_events: - if abs(time - last_time) <= epsilon: - last_val = flattened_events[-1][1] - flattened_events[-1] = (last_time, last_val + val) - else: - last_time = time - flattened_events.append((time, val)) - return flattened_events - - @staticmethod - def from_iterable_request_times( - requests: list[tuple[float, float]], - first_iter_times: list[float], - iter_counts: list[int], - first_iter_counts: list[int] | None = None, - include_cdf: bool = False, - epsilon: float = 1e-6, - ) -> DistributionSummary: - """ - Create statistical summary from iterative request timing data. - - Analyzes autoregressive or streaming requests with multiple iterations - between start and end times. Calculates rate distributions based on - iteration timing patterns for LLM token generation analysis. - - :param requests: List of (start_time, end_time) tuples for each request - :param first_iter_times: Times when first iteration was received for - each request - :param iter_counts: Total iteration counts for each request from first - iteration to end - :param first_iter_counts: Iteration counts for first iteration (defaults - to 1 for each request) - :param include_cdf: Whether to include cumulative distribution function - :param epsilon: Threshold for merging close timing events - :return: DistributionSummary with iteration rate statistical metrics - :raises ValueError: If input lists have mismatched lengths - """ - - if first_iter_counts is None: - first_iter_counts = [1] * len(requests) - - if ( - len(requests) != len(first_iter_times) - or len(requests) != len(iter_counts) - or len(requests) != len(first_iter_counts) - ): - raise ValueError( - "requests, first_iter_times, iter_counts, and first_iter_counts must" - "be the same length." 
- f"Given {len(requests)}, {len(first_iter_times)}, {len(iter_counts)}, " - f"{len(first_iter_counts)}", - ) - - # first break up the requests into individual iterable events - events = defaultdict(int) - global_start = min(start for start, _ in requests) if requests else 0 - global_end = max(end for _, end in requests) if requests else 0 - events[global_start] = 0 - events[global_end] = 0 - - for (_, end), first_iter, first_iter_count, total_count in zip( - requests, first_iter_times, first_iter_counts, iter_counts, strict=True - ): - events[first_iter] += first_iter_count - - if total_count > 1: - iter_latency = (end - first_iter) / (total_count - 1) - for ind in range(1, total_count): - events[first_iter + ind * iter_latency] += 1 - - # combine any events that are very close together - flattened_events: list[tuple[float, int]] = [] - - for time, count in sorted(events.items()): - last_time, last_count = ( - flattened_events[-1] if flattened_events else (None, None) - ) - - if ( - last_time is not None - and last_count is not None - and abs(last_time - time) <= epsilon - ): - flattened_events[-1] = (last_time, last_count + count) - else: - flattened_events.append((time, count)) - - # convert to value distribution function - distribution: dict[float, float] = defaultdict(float) - - for ind in range(len(flattened_events) - 1): - start_time, count = flattened_events[ind] - end_time, _ = flattened_events[ind + 1] - duration = end_time - start_time - rate = count / duration - distribution[rate] += duration - - distribution_list = sorted(distribution.items()) - - return DistributionSummary.from_distribution_function( - distribution=distribution_list, - include_cdf=include_cdf, - ) - - -class StatusDistributionSummary( - StatusBreakdown[ - DistributionSummary, - DistributionSummary, - DistributionSummary, - DistributionSummary, - ] -): - """ - Status-grouped statistical summary for request processing analysis. - - Provides comprehensive statistical analysis grouped by request status (total, - successful, incomplete, errored). Enables performance analysis across different - request outcomes for benchmarking and monitoring applications. Each status - category maintains complete DistributionSummary metrics. - - Example: - :: - status_summary = StatusDistributionSummary.from_values( - value_types=["successful", "error", "successful"], - values=[1.5, 10.0, 2.1] - ) - print(f"Success mean: {status_summary.successful.mean}") - print(f"Error rate: {status_summary.errored.count}") - """ - - @staticmethod - def from_values( - value_types: list[Literal["successful", "incomplete", "error"]], - values: list[float], - weights: list[float] | None = None, - include_cdf: bool = False, - ) -> StatusDistributionSummary: - """ - Create status-grouped statistical summary from values and status types. - - Groups numerical values by request status and calculates complete - statistical summaries for each category. Enables performance analysis - across different request outcomes. 
- - :param value_types: Status type for each value ("successful", "incomplete", - or "error") - :param values: Numerical values representing the distribution - :param weights: Optional weights for each value (defaults to equal weighting) - :param include_cdf: Whether to include cumulative distribution functions - :return: StatusDistributionSummary with statistics grouped by status - :raises ValueError: If input lists have mismatched lengths or invalid - status types - """ - if any( - type_ not in {"successful", "incomplete", "error"} for type_ in value_types - ): - raise ValueError( - "value_types must be one of 'successful', 'incomplete', or 'error'. " - f"Got {value_types} instead.", - ) - - if weights is None: - weights = [1.0] * len(values) - - if len(value_types) != len(values) or len(value_types) != len(weights): - raise ValueError( - "The length of value_types, values, and weights must be the same.", - ) - - _, successful_values, successful_weights = ( - zip(*successful, strict=True) - if ( - successful := list( - filter( - lambda val: val[0] == "successful", - zip(value_types, values, weights, strict=True), - ) - ) - ) - else ([], [], []) - ) - _, incomplete_values, incomplete_weights = ( - zip(*incomplete, strict=True) - if ( - incomplete := list( - filter( - lambda val: val[0] == "incomplete", - zip(value_types, values, weights, strict=True), - ) - ) - ) - else ([], [], []) - ) - _, errored_values, errored_weights = ( - zip(*errored, strict=True) - if ( - errored := list( - filter( - lambda val: val[0] == "error", - zip(value_types, values, weights, strict=True), - ) - ) - ) - else ([], [], []) - ) - - return StatusDistributionSummary( - total=DistributionSummary.from_values( - values, - weights, - include_cdf=include_cdf, - ), - successful=DistributionSummary.from_values( - successful_values, # type: ignore[arg-type] - successful_weights, # type: ignore[arg-type] - include_cdf=include_cdf, - ), - incomplete=DistributionSummary.from_values( - incomplete_values, # type: ignore[arg-type] - incomplete_weights, # type: ignore[arg-type] - include_cdf=include_cdf, - ), - errored=DistributionSummary.from_values( - errored_values, # type: ignore[arg-type] - errored_weights, # type: ignore[arg-type] - include_cdf=include_cdf, - ), - ) - - @staticmethod - def from_request_times( - request_types: list[Literal["successful", "incomplete", "error"]], - requests: list[tuple[float, float]], - distribution_type: Literal["concurrency", "rate"], - weights: list[float] | None = None, - include_cdf: bool = False, - epsilon: float = 1e-6, - ) -> StatusDistributionSummary: - """ - Create status-grouped statistical summary from request timing data. - - Analyzes request timings grouped by status to calculate concurrency or - rate distributions for each outcome category. Enables comparative - performance analysis across successful, incomplete, and errored requests. 
- - :param request_types: Status type for each request ("successful", - "incomplete", or "error") - :param requests: List of (start_time, end_time) tuples for each request - :param distribution_type: Analysis type - "concurrency" or "rate" - :param include_cdf: Whether to include cumulative distribution functions - :param epsilon: Threshold for merging close timing events - :return: StatusDistributionSummary with timing statistics by status - :raises ValueError: If input lists have mismatched lengths or invalid types - """ - if distribution_type not in {"concurrency", "rate"}: - raise ValueError( - f"Invalid distribution_type '{distribution_type}'. " - "Must be 'concurrency' or 'rate'." - ) - - if any( - type_ not in {"successful", "incomplete", "error"} - for type_ in request_types - ): - raise ValueError( - "request_types must be one of 'successful', 'incomplete', or 'error'. " - f"Got {request_types} instead.", - ) - - if len(request_types) != len(requests): - raise ValueError( - "The length of request_types and requests must be the same. " - f"Got {len(request_types)} and {len(requests)} instead.", - ) - - if weights is None: - weights = [1.0] * len(requests) - - if len(requests) != len(weights): - raise ValueError( - "The length of requests and weights must be the same." - f"Got {len(requests)} and {len(weights)} instead.", - ) - - _, successful_requests, successful_weights = ( - zip(*successful, strict=False) - if ( - successful := list( - filter( - lambda val: val[0] == "successful", - zip(request_types, requests, weights, strict=False), - ) - ) - ) - else ([], [], []) - ) - _, incomplete_requests, incomplete_weights = ( - zip(*incomplete, strict=False) - if ( - incomplete := list( - filter( - lambda val: val[0] == "incomplete", - zip(request_types, requests, weights, strict=False), - ) - ) - ) - else ([], [], []) - ) - _, errored_requests, errored_weights = ( - zip(*errored, strict=False) - if ( - errored := list( - filter( - lambda val: val[0] == "error", - zip(request_types, requests, weights, strict=False), - ) - ) - ) - else ([], [], []) - ) - - return StatusDistributionSummary( - total=DistributionSummary.from_request_times( - requests, - distribution_type=distribution_type, - weights=weights, - include_cdf=include_cdf, - epsilon=epsilon, - ), - successful=DistributionSummary.from_request_times( - successful_requests, # type: ignore[arg-type] - distribution_type=distribution_type, - weights=successful_weights, # type: ignore[arg-type] - include_cdf=include_cdf, - epsilon=epsilon, - ), - incomplete=DistributionSummary.from_request_times( - incomplete_requests, # type: ignore[arg-type] - distribution_type=distribution_type, - weights=incomplete_weights, # type: ignore[arg-type] - include_cdf=include_cdf, - epsilon=epsilon, - ), - errored=DistributionSummary.from_request_times( - errored_requests, # type: ignore[arg-type] - distribution_type=distribution_type, - weights=errored_weights, # type: ignore[arg-type] - include_cdf=include_cdf, - epsilon=epsilon, - ), - ) - - @staticmethod - def from_iterable_request_times( - request_types: list[Literal["successful", "incomplete", "error"]], - requests: list[tuple[float, float]], - first_iter_times: list[float], - iter_counts: list[int] | None = None, - first_iter_counts: list[int] | None = None, - include_cdf: bool = False, - epsilon: float = 1e-6, - ) -> StatusDistributionSummary: - """ - Create status-grouped statistical summary from iterative request timing data. 
- - Analyzes autoregressive request timings grouped by status to calculate - iteration rate distributions for each outcome category. Enables comparative - analysis of token generation or streaming response performance across - different request statuses. - - :param request_types: Status type for each request ("successful", - "incomplete", or "error") - :param requests: List of (start_time, end_time) tuples for each request - :param first_iter_times: Times when first iteration was received for - each request - :param iter_counts: Total iteration counts for each request (defaults to 1) - :param first_iter_counts: Iteration counts for first iteration (defaults - to 1) - :param include_cdf: Whether to include cumulative distribution functions - :param epsilon: Threshold for merging close timing events - :return: StatusDistributionSummary with iteration statistics by status - :raises ValueError: If input lists have mismatched lengths or invalid types - """ - if any( - type_ not in {"successful", "incomplete", "error"} - for type_ in request_types - ): - raise ValueError( - "request_types must be one of 'successful', 'incomplete', or 'error'. " - f"Got {request_types} instead.", - ) - - if iter_counts is None: - iter_counts = [1] * len(requests) - - if first_iter_counts is None: - first_iter_counts = [1] * len(requests) - - if ( - len(request_types) != len(requests) - or len(requests) != len(first_iter_times) - or len(requests) != len(iter_counts) - or len(requests) != len(first_iter_counts) - ): - raise ValueError( - "request_types, requests, first_iter_times, iter_counts, and " - "first_iter_counts must be the same length." - f"Given {len(request_types)}, {len(requests)}, " - f"{len(first_iter_times)}, {len(iter_counts)}, " - f"{len(first_iter_counts)}", - ) - - ( - _, - successful_requests, - successful_first_iter_times, - successful_iter_counts, - successful_first_iter_counts, - ) = ( - zip(*successful, strict=True) - if ( - successful := list( - filter( - lambda val: val[0] == "successful", - zip( - request_types, - requests, - first_iter_times, - iter_counts, - first_iter_counts, - strict=True, - ), - ) - ) - ) - else ([], [], [], [], []) - ) - ( - _, - incomplete_requests, - incomplete_first_iter_times, - incomplete_iter_counts, - incomplete_first_iter_counts, - ) = ( - zip(*incomplete, strict=True) - if ( - incomplete := list( - filter( - lambda val: val[0] == "incomplete", - zip( - request_types, - requests, - first_iter_times, - iter_counts, - first_iter_counts, - strict=True, - ), - ) - ) - ) - else ([], [], [], [], []) - ) - ( - _, - errored_requests, - errored_first_iter_times, - errored_iter_counts, - errored_first_iter_counts, - ) = ( - zip(*errored, strict=True) - if ( - errored := list( - filter( - lambda val: val[0] == "error", - zip( - request_types, - requests, - first_iter_times, - iter_counts, - first_iter_counts, - strict=True, - ), - ) - ) - ) - else ([], [], [], [], []) - ) - - return StatusDistributionSummary( - total=DistributionSummary.from_iterable_request_times( - requests, - first_iter_times, - iter_counts, - first_iter_counts, - include_cdf=include_cdf, - epsilon=epsilon, - ), - successful=DistributionSummary.from_iterable_request_times( - successful_requests, # type: ignore[arg-type] - successful_first_iter_times, # type: ignore[arg-type] - successful_iter_counts, # type: ignore[arg-type] - successful_first_iter_counts, # type: ignore[arg-type] - include_cdf=include_cdf, - epsilon=epsilon, - ), - incomplete=DistributionSummary.from_iterable_request_times( - 
incomplete_requests, # type: ignore[arg-type] - incomplete_first_iter_times, # type: ignore[arg-type] - incomplete_iter_counts, # type: ignore[arg-type] - incomplete_first_iter_counts, # type: ignore[arg-type] - include_cdf=include_cdf, - epsilon=epsilon, - ), - errored=DistributionSummary.from_iterable_request_times( - errored_requests, # type: ignore[arg-type] - errored_first_iter_times, # type: ignore[arg-type] - errored_iter_counts, # type: ignore[arg-type] - errored_first_iter_counts, # type: ignore[arg-type] - include_cdf=include_cdf, - epsilon=epsilon, - ), - ) - - -class RunningStats(StandardBaseModel): - """ - Real-time statistics tracking for streaming numerical data. - - Maintains mean, rate, and cumulative statistics for continuous data streams - without storing individual values. Optimized for memory efficiency in - long-running monitoring applications. Supports arithmetic operators for - convenient value addition and provides computed properties for derived metrics. - - Example: - :: - stats = RunningStats() - stats += 10.5 # Add value using operator - stats.update(20.0, count=3) # Add value with custom count - print(f"Mean: {stats.mean}, Rate: {stats.rate}") - """ - - start_time: float = Field( - default_factory=timer.time, - description=( - "The time the running statistics object was created. " - "This is used to calculate the rate of the statistics." - ), - ) - count: int = Field( - default=0, - description="The number of values added to the running statistics.", - ) - total: float = Field( - default=0.0, - description="The total sum of the values added to the running statistics.", - ) - last: float = Field( - default=0.0, - description="The last value added to the running statistics.", - ) - - @computed_field # type: ignore[misc] - @property - def mean(self) -> float: - """ - :return: The mean of the running statistics (total / count). - If count is 0, return 0.0. - """ - if self.count == 0: - return 0.0 - return self.total / self.count - - @computed_field # type: ignore[misc] - @property - def rate(self) -> float: - """ - :return: The rate of the running statistics - (total / (time.time() - start_time)). - If count is 0, return 0.0. - """ - if self.count == 0: - return 0.0 - return self.total / (timer.time() - self.start_time) - - def __add__(self, value: Any) -> float: - """ - Add value using + operator and return current mean. - - :param value: Numerical value to add to the running statistics - :return: Updated mean after adding the value - :raises ValueError: If value is not numeric (int or float) - """ - if not isinstance(value, int | float): - raise ValueError( - f"Value must be an int or float, got {type(value)} instead.", - ) - - self.update(value) - - return self.mean - - def __iadd__(self, value: Any) -> RunningStats: - """ - Add value using += operator and return updated instance. - - :param value: Numerical value to add to the running statistics - :return: Self reference for method chaining - :raises ValueError: If value is not numeric (int or float) - """ - if not isinstance(value, int | float): - raise ValueError( - f"Value must be an int or float, got {type(value)} instead.", - ) - - self.update(value) - - return self - - def update(self, value: float, count: int = 1) -> None: - """ - Update running statistics with new value and count. 
- - :param value: Numerical value to add to the running statistics - :param count: Number of occurrences to count for this value (defaults to 1) - """ - self.count += count - self.total += value - self.last = value - - -class TimeRunningStats(RunningStats): - """ - Specialized running statistics for time-based measurements. - - Extends RunningStats with time-specific computed properties for millisecond - conversions. Designed for tracking latency, duration, and timing metrics in - performance monitoring applications. - - Example: - :: - time_stats = TimeRunningStats() - time_stats += 0.125 # Add 125ms in seconds - print(f"Mean: {time_stats.mean_ms}ms, Total: {time_stats.total_ms}ms") - """ - - @computed_field # type: ignore[misc] - @property - def total_ms(self) -> float: - """ - :return: The total time multiplied by 1000.0 to convert to milliseconds. - """ - return self.total * 1000.0 - - @computed_field # type: ignore[misc] - @property - def last_ms(self) -> float: - """ - :return: The last time multiplied by 1000.0 to convert to milliseconds. - """ - return self.last * 1000.0 - - @computed_field # type: ignore[misc] - @property - def mean_ms(self) -> float: - """ - :return: The mean time multiplied by 1000.0 to convert to milliseconds. - """ - return self.mean * 1000.0 - - @computed_field # type: ignore[misc] - @property - def rate_ms(self) -> float: - """ - :return: The rate of the running statistics multiplied by 1000.0 - to convert to milliseconds. - """ - return self.rate * 1000.0 diff --git a/tests/unit/benchmark/test_output.py b/tests/unit/benchmark/test_output.py index 416a9b2b..590f40f0 100644 --- a/tests/unit/benchmark/test_output.py +++ b/tests/unit/benchmark/test_output.py @@ -10,7 +10,7 @@ from guidellm.benchmark import ( GenerativeBenchmarksReport, ) -from guidellm.benchmark.output import ( +from guidellm.benchmark.outputs.output import ( GenerativeBenchmarkerConsole, GenerativeBenchmarkerCSV, ) diff --git a/tests/unit/mock_benchmark.py b/tests/unit/mock_benchmark.py index 0546d28f..9da4227d 100644 --- a/tests/unit/mock_benchmark.py +++ b/tests/unit/mock_benchmark.py @@ -1,7 +1,7 @@ """Mock benchmark objects for unit testing.""" from guidellm.benchmark import ( - BenchmarkSchedulerStats, + BenchmarkSchedulerMetrics, GenerativeBenchmark, GenerativeMetrics, ) @@ -113,7 +113,7 @@ def mock_generative_benchmark() -> GenerativeBenchmark: ), env_args=StandardBaseDict(), extras=StandardBaseDict(), - run_stats=BenchmarkSchedulerStats( + run_stats=BenchmarkSchedulerMetrics( start_time=1, end_time=2, requests_made=StatusBreakdown( diff --git a/tests/unit/presentation/__init__.py b/tests/unit/presentation/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/unit/presentation/test_data_models.py b/tests/unit/presentation/test_data_models.py deleted file mode 100644 index c1663c43..00000000 --- a/tests/unit/presentation/test_data_models.py +++ /dev/null @@ -1,20 +0,0 @@ -import pytest - -from guidellm.presentation.data_models import Bucket - - -@pytest.mark.smoke -def test_bucket_from_data(): - buckets, bucket_width = Bucket.from_data([8, 8, 8, 8, 8, 8], 1) - assert len(buckets) == 1 - assert buckets[0].value == 8.0 - assert buckets[0].count == 6 - assert bucket_width == 1 - - buckets, bucket_width = Bucket.from_data([8, 8, 8, 8, 8, 7], 1) - assert len(buckets) == 2 - assert buckets[0].value == 7.0 - assert buckets[0].count == 1 - assert buckets[1].value == 8.0 - assert buckets[1].count == 5 - assert bucket_width == 1 diff --git 
a/tests/unit/presentation/test_injector.py b/tests/unit/presentation/test_injector.py deleted file mode 100644 index da269815..00000000 --- a/tests/unit/presentation/test_injector.py +++ /dev/null @@ -1,87 +0,0 @@ -from pathlib import Path - -import pytest -from pydantic import BaseModel - -from guidellm.presentation.injector import create_report, inject_data -from guidellm.settings import settings - - -class ExampleModel(BaseModel): - name: str - version: str - - -@pytest.mark.smoke -def test_inject_data(): - html = "" - expected_html = ( - "" - ) - js_data = { - "window.runInfo = {};": "window.runInfo =" - '{ "model": { "name": "neuralmagic/Qwen2.5-7B-quantized.w8a8" } };' - } - result = inject_data( - js_data, - html, - ) - assert result == expected_html - - -@pytest.mark.smoke -def test_create_report_to_file(tmpdir): - js_data = { - "window.runInfo = {};": "window.runInfo =" - '{ "model": { "name": "neuralmagic/Qwen2.5-7B-quantized.w8a8" } };' - } - html_content = "" - expected_html_content = ( - "" - ) - - mock_html_path = tmpdir.join("template.html") - mock_html_path.write(html_content) - settings.report_generation.source = str(mock_html_path) - - output_path = tmpdir.join("output.html") - result_path = create_report(js_data, str(output_path)) - result_content = result_path.read_text() - - assert result_path == output_path - assert result_content == expected_html_content - - -@pytest.mark.smoke -def test_create_report_with_file_nested_in_dir(tmpdir): - js_data = { - "window.runInfo = {};": "window.runInfo =" - '{ "model": { "name": "neuralmagic/Qwen2.5-7B-quantized.w8a8" } };' - } - html_content = "" - expected_html_content = ( - "" - ) - - output_dir = tmpdir.mkdir("output_dir") - mock_html_path = tmpdir.join("template.html") - mock_html_path.write(html_content) - settings.report_generation.source = str(mock_html_path) - - output_path = Path(output_dir) / "report.html" - result_path = create_report(js_data, str(output_path)) - - with Path(result_path).open("r") as file: - result_content = file.read() - - assert result_path == output_path - assert result_content == expected_html_content diff --git a/tests/unit/utils/test_pydantic_utils.py b/tests/unit/utils/test_pydantic_utils.py index b1278f51..d57b0663 100644 --- a/tests/unit/utils/test_pydantic_utils.py +++ b/tests/unit/utils/test_pydantic_utils.py @@ -10,14 +10,7 @@ import pytest from pydantic import BaseModel, Field, ValidationError -from guidellm.utils import ( - PydanticClassRegistryMixin, - ReloadableBaseModel, - StandardBaseDict, - StandardBaseModel, - StatusBreakdown, -) -from guidellm.utils.pydantic_utils import ( +from guidellm.schemas.base import ( BaseModelT, ErroredT, IncompleteT, @@ -25,6 +18,13 @@ SuccessfulT, TotalT, ) +from guidellm.utils import ( + PydanticClassRegistryMixin, + ReloadableBaseModel, + StandardBaseDict, + StandardBaseModel, + StatusBreakdown, +) @pytest.mark.smoke diff --git a/tests/unit/utils/test_statistics.py b/tests/unit/utils/test_statistics.py index d0f04d99..73c383f2 100644 --- a/tests/unit/utils/test_statistics.py +++ b/tests/unit/utils/test_statistics.py @@ -1,785 +1,1123 @@ +from __future__ import annotations + import math -import time from typing import Literal import numpy as np import pytest +from pydantic import BaseModel, ValidationError -from guidellm.utils.statistics import ( - DistributionSummary, - Percentiles, - RunningStats, - StatusDistributionSummary, - TimeRunningStats, -) - - -def create_default_percentiles() -> Percentiles: - return Percentiles( - p001=0.1, - p01=1.0, - 
p05=5.0, - p10=10.0, - p25=25.0, - p50=50.0, - p75=75.0, - p90=90.0, - p95=95.0, - p99=99.0, - p999=99.9, - ) - - -def create_default_distribution_summary() -> DistributionSummary: - return DistributionSummary( - mean=50.0, - median=50.0, - mode=50.0, - variance=835, - std_dev=math.sqrt(835), - min=0.0, - max=100.0, - count=1001, - total_sum=50050.0, - percentiles=create_default_percentiles(), - ) - - -@pytest.mark.smoke -def test_percentiles_initialization(): - percentiles = create_default_percentiles() - assert percentiles.p001 == 0.1 - assert percentiles.p01 == 1.0 - assert percentiles.p05 == 5.0 - assert percentiles.p10 == 10.0 - assert percentiles.p25 == 25.0 - assert percentiles.p50 == 50.0 - assert percentiles.p75 == 75.0 - assert percentiles.p90 == 90.0 - assert percentiles.p95 == 95.0 - assert percentiles.p99 == 99.0 - assert percentiles.p999 == 99.9 - - -@pytest.mark.smoke -def test_percentiles_invalid_initialization(): - test_kwargs = { - "p001": 0.1, - "p01": 1.0, - "p05": 5.0, - "p10": 10.0, - "p25": 25.0, - "p50": 50.0, - "p75": 75.0, - "p90": 90.0, - "p95": 95.0, - "p99": 99.0, - "p999": 99.9, - } - test_missing_keys = list(test_kwargs.keys()) - - for missing_key in test_missing_keys: - kwargs = {key: val for key, val in test_kwargs.items() if key != missing_key} - with pytest.raises(ValueError): - Percentiles(**kwargs) - - -@pytest.mark.smoke -def test_percentiles_marshalling(): - percentiles = create_default_percentiles() - serialized = percentiles.model_dump() - deserialized = Percentiles.model_validate(serialized) - - for key, value in vars(percentiles).items(): - assert getattr(deserialized, key) == value - - -@pytest.mark.smoke -def test_distribution_summary_initilaization(): - distribution_summary = create_default_distribution_summary() - assert distribution_summary.mean == 50.0 - assert distribution_summary.median == 50.0 - assert distribution_summary.mode == 50.0 - assert distribution_summary.variance == 835 - assert distribution_summary.std_dev == math.sqrt(835) - assert distribution_summary.min == 0.0 - assert distribution_summary.max == 100.0 - assert distribution_summary.count == 1001 - assert distribution_summary.total_sum == 50050.0 - assert distribution_summary.percentiles.p001 == 0.1 - assert distribution_summary.percentiles.p01 == 1.0 - assert distribution_summary.percentiles.p05 == 5.0 - assert distribution_summary.percentiles.p10 == 10.0 - assert distribution_summary.percentiles.p25 == 25.0 - assert distribution_summary.percentiles.p50 == 50.0 - assert distribution_summary.percentiles.p75 == 75.0 - assert distribution_summary.percentiles.p90 == 90.0 - assert distribution_summary.percentiles.p95 == 95.0 - assert distribution_summary.percentiles.p99 == 99.0 - assert distribution_summary.percentiles.p999 == 99.9 - - -@pytest.mark.smoke -def test_distribution_summary_invalid_initialization(): - test_kwargs = { - "mean": 50.0, - "median": 50.0, - "mode": 50.0, - "variance": 835, - "std_dev": math.sqrt(835), - "min": 0.0, - "max": 100.0, - "count": 1001, - "total_sum": 50050.0, - "percentiles": create_default_percentiles(), - } - test_missing_keys = list(test_kwargs.keys()) - for missing_key in test_missing_keys: - kwargs = {key: val for key, val in test_kwargs.items() if key != missing_key} - with pytest.raises(ValueError): - DistributionSummary(**kwargs) # type: ignore[arg-type] - - -@pytest.mark.smoke -def test_distribution_summary_marshalling(): - distribution_summary = create_default_distribution_summary() - serialized = distribution_summary.model_dump() - 
deserialized = DistributionSummary.model_validate(serialized) - - for key, value in vars(distribution_summary).items(): - assert getattr(deserialized, key) == value - - -@pytest.mark.smoke -def test_distribution_summary_from_distribution_function(): - values = [val / 10.0 for val in range(1001)] - distribution = [(val, 1.0) for val in values] - distribution_summary = DistributionSummary.from_distribution_function(distribution) - assert distribution_summary.mean == pytest.approx(np.mean(values)) - assert distribution_summary.median == pytest.approx(np.median(values)) - assert distribution_summary.mode == 0.0 - assert distribution_summary.variance == pytest.approx(np.var(values, ddof=0)) - assert distribution_summary.std_dev == pytest.approx(np.std(values, ddof=0)) - assert distribution_summary.min == min(values) - assert distribution_summary.max == max(values) - assert distribution_summary.count == len(values) - assert distribution_summary.total_sum == sum(values) - assert distribution_summary.percentiles.p001 == pytest.approx( - np.percentile(values, 0.1) - ) - assert distribution_summary.percentiles.p01 == pytest.approx( - np.percentile(values, 1.0) - ) - assert distribution_summary.percentiles.p05 == pytest.approx( - np.percentile(values, 5.0) - ) - assert distribution_summary.percentiles.p10 == pytest.approx( - np.percentile(values, 10.0) - ) - assert distribution_summary.percentiles.p25 == pytest.approx( - np.percentile(values, 25.0) - ) - assert distribution_summary.percentiles.p50 == pytest.approx( - np.percentile(values, 50.0) - ) - assert distribution_summary.percentiles.p75 == pytest.approx( - np.percentile(values, 75.0) - ) - assert distribution_summary.percentiles.p90 == pytest.approx( - np.percentile(values, 90.0) - ) - assert distribution_summary.percentiles.p95 == pytest.approx( - np.percentile(values, 95.0) - ) - assert distribution_summary.percentiles.p99 == pytest.approx( - np.percentile(values, 99.0) - ) - assert distribution_summary.percentiles.p999 == pytest.approx( - np.percentile(values, 99.9) - ) - assert distribution_summary.cumulative_distribution_function is None - - distribution_summary_cdf = DistributionSummary.from_distribution_function( - distribution, include_cdf=True - ) - assert distribution_summary_cdf.cumulative_distribution_function is not None - assert len(distribution_summary_cdf.cumulative_distribution_function) == len(values) - - -def test_distribution_summary_from_values(): - values = [val / 10 for val in range(1001)] - distribution_summary = DistributionSummary.from_values(values) - assert distribution_summary.mean == pytest.approx(np.mean(values)) - assert distribution_summary.median == pytest.approx(np.median(values)) - assert distribution_summary.mode == 0.0 - assert distribution_summary.variance == pytest.approx(np.var(values, ddof=0)) - assert distribution_summary.std_dev == pytest.approx(np.std(values, ddof=0)) - assert distribution_summary.min == min(values) - assert distribution_summary.max == max(values) - assert distribution_summary.count == len(values) - assert distribution_summary.total_sum == sum(values) - assert distribution_summary.percentiles.p001 == pytest.approx( - np.percentile(values, 0.1) - ) - assert distribution_summary.percentiles.p01 == pytest.approx( - np.percentile(values, 1.0) - ) - assert distribution_summary.percentiles.p05 == pytest.approx( - np.percentile(values, 5.0) - ) - assert distribution_summary.percentiles.p10 == pytest.approx( - np.percentile(values, 10.0) - ) - assert distribution_summary.percentiles.p25 == 
pytest.approx( - np.percentile(values, 25.0) - ) - assert distribution_summary.percentiles.p50 == pytest.approx( - np.percentile(values, 50.0) - ) - assert distribution_summary.percentiles.p75 == pytest.approx( - np.percentile(values, 75.0) - ) - assert distribution_summary.percentiles.p90 == pytest.approx( - np.percentile(values, 90.0) - ) - assert distribution_summary.percentiles.p95 == pytest.approx( - np.percentile(values, 95.0) - ) - assert distribution_summary.percentiles.p99 == pytest.approx( - np.percentile(values, 99.0) - ) - assert distribution_summary.percentiles.p999 == pytest.approx( - np.percentile(values, 99.9) - ) - assert distribution_summary.cumulative_distribution_function is None +from guidellm.utils import DistributionSummary, Percentiles, StatusDistributionSummary - distribution_summary_weights = DistributionSummary.from_values( - values, weights=[2] * len(values) - ) - assert distribution_summary_weights.mean == pytest.approx(np.mean(values)) - assert distribution_summary_weights.median == pytest.approx(np.median(values)) - assert distribution_summary_weights.mode == 0.0 - assert distribution_summary_weights.variance == pytest.approx( - np.var(values, ddof=0) - ) - assert distribution_summary_weights.std_dev == pytest.approx(np.std(values, ddof=0)) - assert distribution_summary_weights.min == min(values) - assert distribution_summary_weights.max == max(values) - assert distribution_summary_weights.count == len(values) - assert distribution_summary_weights.total_sum == sum(values) - assert distribution_summary_weights.cumulative_distribution_function is None - - distribution_summary_cdf = DistributionSummary.from_values(values, include_cdf=True) - assert distribution_summary_cdf.cumulative_distribution_function is not None - assert len(distribution_summary_cdf.cumulative_distribution_function) == len(values) - - -def test_distribution_summary_from_request_times_concurrency(): - # create consistent timestamped values matching a rate of 10 per second - requests = [(val / 10, val / 10 + 1) for val in range(10001)] - distribution_summary = DistributionSummary.from_request_times( - requests, distribution_type="concurrency" - ) - assert distribution_summary.mean == pytest.approx(10.0, abs=0.01) - assert distribution_summary.median == pytest.approx(10.0) - assert distribution_summary.mode == 10.0 - assert distribution_summary.variance == pytest.approx(0, abs=0.1) - assert distribution_summary.std_dev == pytest.approx(0, abs=0.3) - assert distribution_summary.min == pytest.approx(1) - assert distribution_summary.max == pytest.approx(10.0) - assert distribution_summary.count == 10 - assert distribution_summary.total_sum == pytest.approx(55.0) - assert distribution_summary.percentiles.p001 == pytest.approx(10, abs=5) - assert distribution_summary.percentiles.p01 == pytest.approx(10) - assert distribution_summary.percentiles.p05 == pytest.approx(10) - assert distribution_summary.percentiles.p10 == pytest.approx(10) - assert distribution_summary.percentiles.p25 == pytest.approx(10) - assert distribution_summary.percentiles.p50 == pytest.approx(10) - assert distribution_summary.percentiles.p75 == pytest.approx(10) - assert distribution_summary.percentiles.p90 == pytest.approx(10) - assert distribution_summary.percentiles.p95 == pytest.approx(10) - assert distribution_summary.percentiles.p99 == pytest.approx(10) - assert distribution_summary.percentiles.p999 == pytest.approx(10) - assert distribution_summary.cumulative_distribution_function is None - - distribution_summary_cdf = 
DistributionSummary.from_request_times( - requests, distribution_type="concurrency", include_cdf=True - ) - assert distribution_summary_cdf.cumulative_distribution_function is not None - assert len(distribution_summary_cdf.cumulative_distribution_function) == 10 +def generate_pdf( + distribution: str | None, distribution_args: dict, size: int +) -> np.ndarray: + if distribution is None: + return np.empty((0, 2)) -def test_distribution_summary_from_request_times_rate(): - # create consistent timestamped values matching a rate of 10 per second - requests = [(val / 10, val / 10 + 1) for val in range(10001)] - distribution_summary = DistributionSummary.from_request_times( - requests, distribution_type="rate" - ) - assert distribution_summary.mean == pytest.approx(10.0, abs=0.01) - assert distribution_summary.median == pytest.approx(10.0) - assert distribution_summary.mode == pytest.approx(10.0) - assert distribution_summary.variance == pytest.approx(0, abs=0.1) - assert distribution_summary.std_dev == pytest.approx(0, abs=0.3) - assert distribution_summary.min == pytest.approx(1.0) - assert distribution_summary.max == pytest.approx(10.0) - assert distribution_summary.count == 12 - assert distribution_summary.total_sum == pytest.approx(111.0) - assert distribution_summary.percentiles.p001 == pytest.approx(10.0, abs=0.5) - assert distribution_summary.percentiles.p01 == pytest.approx(10.0) - assert distribution_summary.percentiles.p05 == pytest.approx(10.0) - assert distribution_summary.percentiles.p10 == pytest.approx(10.0) - assert distribution_summary.percentiles.p25 == pytest.approx(10.0) - assert distribution_summary.percentiles.p50 == pytest.approx(10.0) - assert distribution_summary.percentiles.p75 == pytest.approx(10.0) - assert distribution_summary.percentiles.p90 == pytest.approx(10.0) - assert distribution_summary.percentiles.p95 == pytest.approx(10.0) - assert distribution_summary.percentiles.p99 == pytest.approx(10.0) - assert distribution_summary.percentiles.p999 == pytest.approx(10.0) - assert distribution_summary.cumulative_distribution_function is None - - distribution_summary_cdf = DistributionSummary.from_request_times( - requests, distribution_type="rate", include_cdf=True - ) - assert distribution_summary_cdf.cumulative_distribution_function is not None - assert len(distribution_summary_cdf.cumulative_distribution_function) == 12 - - -def test_distribution_summary_from_iterable_request_times(): - # create consistent timestamped values matching a rate of 10 per second - requests = [(val / 10, val / 10 + 1) for val in range(10001)] - # create 9 iterations for each request with first iter at start + 0.1 - # and spaced at 0.1 seconds apart - first_iter_times = [val / 10 + 0.1 for val in range(10001)] - iter_counts = [9 for _ in range(10001)] - first_iter_counts = [1 for _ in range(10001)] - - distribution_summary = DistributionSummary.from_iterable_request_times( - requests, first_iter_times, iter_counts, first_iter_counts - ) - assert distribution_summary.mean == pytest.approx(90.0, abs=0.1) - assert distribution_summary.median == pytest.approx(80.0) - assert distribution_summary.mode == pytest.approx(80.0) - assert distribution_summary.variance == pytest.approx(704.463, abs=0.001) - assert distribution_summary.std_dev == pytest.approx(26.541, abs=0.001) - assert distribution_summary.min == pytest.approx(0.0) - assert distribution_summary.max == pytest.approx(160.0) - assert distribution_summary.count == 44 - assert distribution_summary.total_sum == pytest.approx(3538.85, 
abs=0.01) - assert distribution_summary.percentiles.p001 == pytest.approx(80.0) - assert distribution_summary.percentiles.p01 == pytest.approx(80.0) - assert distribution_summary.percentiles.p05 == pytest.approx(80.0) - assert distribution_summary.percentiles.p10 == pytest.approx(80.0) - assert distribution_summary.percentiles.p25 == pytest.approx(80.0) - assert distribution_summary.percentiles.p50 == pytest.approx(80.0) - assert distribution_summary.percentiles.p75 == pytest.approx(80.0) - assert distribution_summary.percentiles.p90 == pytest.approx(160.0) - assert distribution_summary.percentiles.p95 == pytest.approx(160.0) - assert distribution_summary.percentiles.p99 == pytest.approx(160.0) - assert distribution_summary.percentiles.p999 == pytest.approx(160.0) - assert distribution_summary.cumulative_distribution_function is None - - distribution_summary_cdf = DistributionSummary.from_iterable_request_times( - requests, first_iter_times, iter_counts, first_iter_counts, include_cdf=True - ) - assert distribution_summary_cdf.cumulative_distribution_function is not None - assert len(distribution_summary_cdf.cumulative_distribution_function) == 44 - - -def test_status_distribution_summary_initialization(): - status_distribution_summary = StatusDistributionSummary( - total=create_default_distribution_summary(), - successful=create_default_distribution_summary(), - incomplete=create_default_distribution_summary(), - errored=create_default_distribution_summary(), - ) - assert status_distribution_summary.total.mean == 50.0 - assert status_distribution_summary.successful.mean == 50.0 - assert status_distribution_summary.incomplete.mean == 50.0 - assert status_distribution_summary.errored.mean == 50.0 - - -def test_status_distribution_summary_marshalling(): - status_distribution_summary = StatusDistributionSummary( - total=create_default_distribution_summary(), - successful=create_default_distribution_summary(), - incomplete=create_default_distribution_summary(), - errored=create_default_distribution_summary(), - ) - serialized = status_distribution_summary.model_dump() - deserialized = StatusDistributionSummary.model_validate(serialized) - - for key, value in vars(status_distribution_summary).items(): - for child_key, child_value in vars(value).items(): - assert getattr(getattr(deserialized, key), child_key) == child_value - - -def test_status_distribution_summary_from_values(): - value_types: list[Literal["successful", "incomplete", "error"]] = [ - "successful", - "incomplete", - "error", - ] * 1000 - values = [float(val % 3) for val in range(3000)] - status_distribution_summary = StatusDistributionSummary.from_values( - value_types, values - ) - assert status_distribution_summary.total.count == len(values) - assert status_distribution_summary.total.mean == pytest.approx(np.mean(values)) - assert status_distribution_summary.total.cumulative_distribution_function is None - assert status_distribution_summary.successful.mean == pytest.approx( - np.mean( - [val for ind, val in enumerate(values) if value_types[ind] == "successful"] + if distribution == "normal": + mean = distribution_args.get("loc", 0.0) + std_dev = distribution_args.get("scale", 1.0) + x_values = np.linspace(mean - 4 * std_dev, mean + 4 * std_dev, size) + pdf_values = (1.0 / np.sqrt(2 * np.pi * std_dev**2)) * np.exp( + -1.0 * ((x_values - mean) ** 2) / (2 * std_dev**2) ) - ) - assert status_distribution_summary.successful.count == len( - [val for ind, val in enumerate(values) if value_types[ind] == "successful"] - ) - assert ( - 
status_distribution_summary.successful.cumulative_distribution_function is None - ) - assert status_distribution_summary.incomplete.mean == pytest.approx( - np.mean( - [val for ind, val in enumerate(values) if value_types[ind] == "incomplete"] + elif distribution == "uniform": + low = distribution_args.get("low", 0.0) + high = distribution_args.get("high", 1.0) + x_values = np.linspace(low, high, size) + pdf_values = np.full_like(x_values, 1.0 / (high - low)) + elif distribution == "exponential": + scale = distribution_args.get("scale", 1.0) + x_values = np.linspace(0, 10 * scale, size) + pdf_values = (1 / scale) * np.exp(-x_values / scale) + elif distribution == "poisson": + lam = distribution_args.get("lam", 1.0) + x_values = np.arange(0, 20) + pdf_values = (lam**x_values * np.exp(-lam)) / np.array( + [math.factorial(x) for x in x_values] ) - ) - assert status_distribution_summary.incomplete.count == len( - [val for ind, val in enumerate(values) if value_types[ind] == "incomplete"] - ) - assert ( - status_distribution_summary.incomplete.cumulative_distribution_function is None - ) - assert status_distribution_summary.errored.mean == pytest.approx( - np.mean([val for ind, val in enumerate(values) if value_types[ind] == "error"]) - ) - assert status_distribution_summary.errored.count == len( - [val for ind, val in enumerate(values) if value_types[ind] == "error"] - ) - assert status_distribution_summary.errored.cumulative_distribution_function is None - - status_distribution_summary_cdf = StatusDistributionSummary.from_values( - value_types, values, include_cdf=True - ) - assert ( - status_distribution_summary_cdf.total.cumulative_distribution_function - is not None - ) - assert ( - status_distribution_summary_cdf.successful.cumulative_distribution_function - is not None - ) - assert ( - status_distribution_summary_cdf.incomplete.cumulative_distribution_function - is not None - ) - assert ( - status_distribution_summary_cdf.errored.cumulative_distribution_function - is not None - ) - - -def test_status_distribution_summary_from_request_times(): - request_types: list[Literal["successful", "incomplete", "error"]] = [ - "successful", - "incomplete", - "error", - ] * 1000 - requests = [((val % 3) / 10, (val % 3) / 10 + 1) for val in range(3000)] - status_distribution_summary = StatusDistributionSummary.from_request_times( - request_types, requests, distribution_type="concurrency" - ) - assert status_distribution_summary.total.mean == pytest.approx(2500.0, abs=0.01) - assert status_distribution_summary.total.cumulative_distribution_function is None - assert status_distribution_summary.successful.mean == pytest.approx( - 1000.0, abs=0.01 - ) - assert ( - status_distribution_summary.successful.cumulative_distribution_function is None - ) - assert status_distribution_summary.incomplete.mean == pytest.approx( - 1000.0, abs=0.01 - ) - assert ( - status_distribution_summary.incomplete.cumulative_distribution_function is None - ) - assert status_distribution_summary.errored.mean == pytest.approx(1000.0, abs=0.01) - assert status_distribution_summary.errored.cumulative_distribution_function is None - - status_distribution_summary_cdf = StatusDistributionSummary.from_request_times( - request_types, requests, distribution_type="concurrency", include_cdf=True - ) - assert ( - status_distribution_summary_cdf.total.cumulative_distribution_function - is not None - ) - assert ( - status_distribution_summary_cdf.successful.cumulative_distribution_function - is not None - ) - assert ( - 
status_distribution_summary_cdf.incomplete.cumulative_distribution_function - is not None - ) - assert ( - status_distribution_summary_cdf.errored.cumulative_distribution_function - is not None - ) + else: + raise ValueError(f"Unsupported distribution type: {distribution}") + + return np.column_stack((x_values, pdf_values / np.sum(pdf_values))) + + +@pytest.fixture( + params=[ + {"distribution": None, "distribution_args": {}}, + { + "distribution": "normal", + "distribution_args": {"loc": 5.0, "scale": 1.0}, + }, + { + "distribution": "normal", + "distribution_args": {"loc": 100.0, "scale": 15.0}, + }, + {"distribution": "uniform", "distribution_args": {"low": 3.4, "high": 9.8}}, + { + "distribution": "exponential", + "distribution_args": {"scale": 1.0}, + }, + { + "distribution": "poisson", + "distribution_args": {"lam": 5.0}, + }, + ] +) +def probability_distributions( + request, +) -> tuple[str | None, np.ndarray, np.ndarray, dict[str, float]]: + """ + Create various probability distributions for testing. + :return: A tuple containing the distribution type, the generated values, + the pdf, and the correct distribution statistics. + """ + distribution_type: str | None = request.param["distribution"] + distribution_args: dict[str, float] = request.param["distribution_args"] + + num_samples = 10000 + rng = np.random.default_rng(seed=42) + percentile_probs = { + "p001": 0.001, + "p01": 0.01, + "p05": 0.05, + "p10": 0.1, + "p25": 0.25, + "p50": 0.5, + "p75": 0.75, + "p90": 0.9, + "p95": 0.95, + "p99": 0.99, + "p999": 0.999, + } -def test_status_distribution_summary_from_iterable_request_times(): - request_types: list[Literal["successful", "incomplete", "error"]] = [ - "successful", - "incomplete", - "error", - ] * 1000 - requests = [(val % 3 / 10, val % 3 / 10 + 1) for val in range(3000)] - first_iter_times = [val % 3 / 10 + 0.1 for val in range(3000)] - iter_counts = [9 for _ in range(3000)] - first_iter_counts = [1 for _ in range(3000)] - status_distribution_summary = StatusDistributionSummary.from_iterable_request_times( - request_types, - requests, - first_iter_times, - iter_counts, - first_iter_counts, - ) - assert status_distribution_summary.total.mean == pytest.approx(21666.66, abs=0.01) - assert status_distribution_summary.total.cumulative_distribution_function is None - assert status_distribution_summary.successful.mean == pytest.approx( - 8000.0, abs=0.01 - ) - assert ( - status_distribution_summary.successful.cumulative_distribution_function is None - ) - assert status_distribution_summary.incomplete.mean == pytest.approx( - 8000.0, abs=0.01 - ) - assert ( - status_distribution_summary.incomplete.cumulative_distribution_function is None - ) - assert status_distribution_summary.errored.mean == pytest.approx(8000.0, abs=0.01) - assert status_distribution_summary.errored.cumulative_distribution_function is None + if distribution_type is None: + # Empty / 0's distribution + return ( + None, + [], + np.empty((0, 2)), + { + "mean": 0.0, + "median": 0.0, + "mode": 0.0, + "variance": 0.0, + "std_dev": 0.0, + "min": 0.0, + "max": 0.0, + "count": 0, + "total_sum": 0.0, + "percentiles": dict.fromkeys(percentile_probs.keys(), 0.0), + }, + ) - status_distribution_summary_cdf = ( - StatusDistributionSummary.from_iterable_request_times( - request_types, + rng = np.random.default_rng(seed=42) + samples = getattr(rng, distribution_type)(**distribution_args, size=num_samples) + pdf = np.column_stack( + (np.sort(samples), np.zeros_like(samples) + 1.0 / num_samples) + ) + + return ( + 
distribution_type, + samples, + pdf, + { + "mean": float(np.mean(samples)), + "median": float(np.median(samples)), + "variance": float(np.var(samples)), + "std_dev": float(np.std(samples)), + "min": float(np.min(samples)), + "max": float(np.max(samples)), + "count": int(len(samples)), + "total_sum": float(np.sum(samples)), + "percentiles": { + key: float(np.percentile(samples, per * 100)) + for key, per in percentile_probs.items() + }, + }, + ) + + +def concurrency_distributions( + concurrency_type: Literal[ + "sequential", + "parallel", + "constant_rate", + "burst", + "triangular_ramp", + "normal_dist", + ], + num_requests: int = 100, + start_time: float = 0.0, + end_time: float = 100.0, +) -> tuple[ + Literal["sequential", "parallel", "constant_rate", "burst", "triangular_ramp"], + np.ndarray, + dict[str, float], +]: + if concurrency_type == "sequential": + timings = np.linspace(start_time, end_time, num_requests + 1) + requests = np.column_stack((timings[:-1], timings[1:])) + + return ( + concurrency_type, requests, - first_iter_times, - iter_counts, - first_iter_counts, - include_cdf=True, + { + "start_time": None, + "end_time": None, + "mean_concurrency": 1.0, + "median_concurrency": 1.0, + "std_dev_concurrency": 0.0, + }, ) - ) - assert ( - status_distribution_summary_cdf.total.cumulative_distribution_function - is not None - ) - assert ( - status_distribution_summary_cdf.successful.cumulative_distribution_function - is not None - ) - assert ( - status_distribution_summary_cdf.incomplete.cumulative_distribution_function - is not None - ) - assert ( - status_distribution_summary_cdf.errored.cumulative_distribution_function - is not None - ) - - -def test_running_stats_initialization(): - running_stats = RunningStats() - assert running_stats.start_time == pytest.approx(time.time(), abs=0.01) - assert running_stats.count == 0 - assert running_stats.total == 0 - assert running_stats.last == 0 - assert running_stats.mean == 0 - assert running_stats.rate == 0 - - -def test_running_stats_marshalling(): - running_stats = RunningStats() - serialized = running_stats.model_dump() - deserialized = RunningStats.model_validate(serialized) - for key, value in vars(running_stats).items(): - assert getattr(deserialized, key) == value - - -def test_running_stats_update(): - running_stats = RunningStats() - running_stats.update(1) - assert running_stats.count == 1 - assert running_stats.total == 1 - assert running_stats.last == 1 - assert running_stats.mean == 1 - time.sleep(1.0) - assert running_stats.rate == pytest.approx( - 1.0 / (time.time() - running_stats.start_time), abs=0.1 - ) - - running_stats.update(2) - assert running_stats.count == 2 - assert running_stats.total == 3 - assert running_stats.last == 2 - assert running_stats.mean == 1.5 - time.sleep(1) - assert running_stats.rate == pytest.approx( - 3 / (time.time() - running_stats.start_time), abs=0.1 - ) + if concurrency_type == "parallel": + requests = np.column_stack( + (np.ones(num_requests) * start_time, np.ones(num_requests) * end_time) + ) + return ( + concurrency_type, + requests, + { + "start_time": None, + "end_time": None, + "mean_concurrency": num_requests, + "median_concurrency": num_requests, + "std_dev_concurrency": 0.0, + }, + ) -def test_running_stats_add(): - running_stats = RunningStats() - mean = running_stats + 1 - assert mean == 1 - assert mean == running_stats.mean - assert running_stats.count == 1 - assert running_stats.total == 1 - assert running_stats.last == 1 - - -def test_running_stats_iadd(): - running_stats = 
RunningStats() - running_stats += 1 - assert running_stats.count == 1 - assert running_stats.total == 1 - assert running_stats.last == 1 - assert running_stats.mean == 1 - - -def test_time_running_stats_initialization(): - time_running_stats = TimeRunningStats() - assert time_running_stats.start_time == pytest.approx(time.time(), abs=0.01) - assert time_running_stats.count == 0 - assert time_running_stats.total == 0 - assert time_running_stats.last == 0 - assert time_running_stats.mean == 0 - assert time_running_stats.rate == 0 - assert time_running_stats.total_ms == 0 - assert time_running_stats.last_ms == 0 - assert time_running_stats.mean_ms == 0 - assert time_running_stats.rate_ms == 0 - - -def test_time_running_stats_marshalling(): - time_running_stats = TimeRunningStats() - serialized = time_running_stats.model_dump() - deserialized = TimeRunningStats.model_validate(serialized) - - for key, value in vars(time_running_stats).items(): - assert getattr(deserialized, key) == value - - -def test_time_running_stats_update(): - time_running_stats = TimeRunningStats() - time_running_stats.update(1) - assert time_running_stats.count == 1 - assert time_running_stats.total == 1 - assert time_running_stats.last == 1 - assert time_running_stats.mean == 1 - assert time_running_stats.total_ms == 1000 - assert time_running_stats.last_ms == 1000 - assert time_running_stats.mean_ms == 1000 - time.sleep(1.0) - assert time_running_stats.rate == pytest.approx( - 1.0 / (time.time() - time_running_stats.start_time), abs=0.1 - ) - assert time_running_stats.rate_ms == pytest.approx( - 1000 / (time.time() - time_running_stats.start_time), abs=0.1 - ) + if concurrency_type == "constant_rate": + request_duration = (end_time - start_time) / 10 + timings = np.linspace(start_time, end_time - request_duration, num_requests) + requests = np.column_stack((timings, timings + request_duration)) + request_delay = timings[1] - timings[0] + rate = 1 / request_delay + concurrency = rate * request_duration - time_running_stats.update(2) - assert time_running_stats.count == 2 - assert time_running_stats.total == 3 - assert time_running_stats.last == 2 - assert time_running_stats.mean == 1.5 - assert time_running_stats.total_ms == 3000 - assert time_running_stats.last_ms == 2000 - assert time_running_stats.mean_ms == 1500 - time.sleep(1) - assert time_running_stats.rate == pytest.approx( - 3 / (time.time() - time_running_stats.start_time), abs=0.1 - ) - assert time_running_stats.rate_ms == pytest.approx( - 3000 / (time.time() - time_running_stats.start_time), abs=0.1 - ) + return ( + concurrency_type, + requests, + { + "start_time": request_delay * concurrency, + "end_time": end_time - request_delay * concurrency, + "mean_concurrency": concurrency, + "median_concurrency": concurrency, + "std_dev_concurrency": 0.0, + }, + ) + if concurrency_type == "burst": + request_length = (end_time - start_time) / 10 + requests = np.column_stack( + ( + np.repeat(start_time, num_requests), + np.repeat(start_time + request_length, num_requests), + ) + ) -@pytest.mark.regression -def test_distribution_summary_concurrency_double_counting_regression(): - """Specific regression test for the double-counting bug in concurrency calculation. 
+ fraction_active = request_length / (end_time - start_time) + mean_concurrency_windowed = num_requests * fraction_active + median_concurrency_windowed = 0.0 if fraction_active < 0.5 else num_requests + variance = ( + fraction_active * (num_requests - mean_concurrency_windowed) ** 2 + + (1 - fraction_active) * mean_concurrency_windowed**2 + ) + std_dev_concurrency_windowed = variance**0.5 - Before the fix, when events were merged due to epsilon, the deltas were summed - but then the active count wasn't properly accumulated, causing incorrect results. + return ( + concurrency_type, + requests, + { + "start_time": start_time, + "end_time": end_time, + "mean_concurrency": mean_concurrency_windowed, + "median_concurrency": median_concurrency_windowed, + "std_dev_concurrency": std_dev_concurrency_windowed, + }, + ) - ### WRITTEN BY AI ### - """ - epsilon = 1e-6 - - # Create a scenario where multiple requests start at exactly the same time - # This should result in events being merged, testing the accumulation logic - same_start_time = 1.0 - requests = [ - (same_start_time, 3.0), - (same_start_time, 4.0), - (same_start_time, 5.0), - (same_start_time + epsilon / 3, 6.0), # Very close start (within epsilon) - ] + if concurrency_type == "triangular_ramp": + max_concurrency = num_requests + ramp_up_time = (end_time - start_time) / 2 + request_duration = ramp_up_time + timings = np.linspace(start_time, start_time + ramp_up_time, max_concurrency) + requests = np.column_stack((timings, timings + request_duration)) - distribution_summary = DistributionSummary.from_request_times( - requests, distribution_type="concurrency", epsilon=epsilon - ) + return ( + concurrency_type, + requests, + { + "start_time": None, + "end_time": None, + "mean_concurrency": max_concurrency / 2, + "median_concurrency": max_concurrency / 2, + "std_dev_concurrency": max_concurrency / (2 * math.sqrt(3)), + }, + ) - # All requests start at the same time (or within epsilon), so they should - # all be considered concurrent from the start - # Expected timeline: - # - t=1.0-3.0: 4 concurrent requests - # - t=3.0-4.0: 3 concurrent requests - # - t=4.0-5.0: 2 concurrent requests - # - t=5.0-6.0: 1 concurrent request + return None + + +class TestPercentiles: + @pytest.fixture + def valid_instances( + self, + probability_distributions: tuple[ + str | None, np.ndarray, np.ndarray, dict[str, float] + ], + ) -> tuple[Percentiles, str | None, np.ndarray, np.ndarray, dict[str, float]]: + dist_type, samples, pdf, stats = probability_distributions + instance = Percentiles( + p001=stats["percentiles"]["p001"], + p01=stats["percentiles"]["p01"], + p05=stats["percentiles"]["p05"], + p10=stats["percentiles"]["p10"], + p25=stats["percentiles"]["p25"], + p50=stats["percentiles"]["p50"], + p75=stats["percentiles"]["p75"], + p90=stats["percentiles"]["p90"], + p95=stats["percentiles"]["p95"], + p99=stats["percentiles"]["p99"], + p999=stats["percentiles"]["p999"], + ) + return instance, dist_type, samples, pdf, stats + + @pytest.mark.smoke + def test_class_signatures(self): + assert issubclass(Percentiles, BaseModel) + assert "p001" in Percentiles.model_fields + assert "p01" in Percentiles.model_fields + assert "p05" in Percentiles.model_fields + assert "p10" in Percentiles.model_fields + assert "p25" in Percentiles.model_fields + assert "p50" in Percentiles.model_fields + assert "p75" in Percentiles.model_fields + assert "p90" in Percentiles.model_fields + assert "p95" in Percentiles.model_fields + assert "p99" in Percentiles.model_fields + assert "p999" 
in Percentiles.model_fields
+        assert hasattr(Percentiles, "from_pdf")
+
+    @pytest.mark.smoke
+    def test_initialization(
+        self,
+        valid_instances: tuple[
+            Percentiles, str | None, np.ndarray, np.ndarray, dict[str, float]
+        ],
+    ):
+        instance, _dist_type, _samples, _pdf, stats = valid_instances
+        assert isinstance(instance, Percentiles)
+        assert instance.p001 == stats["percentiles"]["p001"], "p001 percentile mismatch"
+        assert instance.p01 == stats["percentiles"]["p01"], "p01 percentile mismatch"
+        assert instance.p05 == stats["percentiles"]["p05"], "p05 percentile mismatch"
+        assert instance.p10 == stats["percentiles"]["p10"], "p10 percentile mismatch"
+        assert instance.p25 == stats["percentiles"]["p25"], "p25 percentile mismatch"
+        assert instance.p50 == stats["percentiles"]["p50"], "p50 percentile mismatch"
+        assert instance.p75 == stats["percentiles"]["p75"], "p75 percentile mismatch"
+        assert instance.p90 == stats["percentiles"]["p90"], "p90 percentile mismatch"
+        assert instance.p95 == stats["percentiles"]["p95"], "p95 percentile mismatch"
+        assert instance.p999 == stats["percentiles"]["p999"], "p999 percentile mismatch"
+        assert instance.p99 == stats["percentiles"]["p99"], "p99 percentile mismatch"
+
+    @pytest.mark.sanity
+    @pytest.mark.parametrize(
+        "missing_field",
+        ["p001", "p01", "p05", "p10", "p25", "p50", "p75", "p90", "p95", "p99", "p999"],
+    )
+    def test_invalid_initialization(self, missing_field):
+        test_kwargs = {
+            "p001": 0.1,
+            "p01": 1.0,
+            "p05": 5.0,
+            "p10": 10.0,
+            "p25": 25.0,
+            "p50": 50.0,
+            "p75": 75.0,
+            "p90": 90.0,
+            "p95": 95.0,
+            "p99": 99.0,
+            "p999": 99.9,
+        }
+        del test_kwargs[missing_field]
+
+        with pytest.raises(ValidationError):
+            Percentiles(**test_kwargs)
+
+    @pytest.mark.smoke
+    def test_from_pdf(self, valid_instances):
+        _instance, _dist_type, _values, pdf, stats = valid_instances
+
+        tolerance = 0.1 * abs(stats["std_dev"])  # within 10% of standard deviation
+        percentiles = Percentiles.from_pdf(pdf)
+        assert percentiles.p001 == pytest.approx(
+            stats["percentiles"]["p001"], abs=tolerance
+        ), "p001 percentile mismatch"
+        assert percentiles.p01 == pytest.approx(
+            stats["percentiles"]["p01"], abs=tolerance
+        ), "p01 percentile mismatch"
+        assert percentiles.p05 == pytest.approx(
+            stats["percentiles"]["p05"], abs=tolerance
+        ), "p05 percentile mismatch"
+        assert percentiles.p10 == pytest.approx(
+            stats["percentiles"]["p10"], abs=tolerance
+        ), "p10 percentile mismatch"
+        assert percentiles.p25 == pytest.approx(
+            stats["percentiles"]["p25"], abs=tolerance
+        ), "p25 percentile mismatch"
+        assert percentiles.p50 == pytest.approx(
+            stats["percentiles"]["p50"], abs=tolerance
+        ), "p50 percentile mismatch"
+        assert percentiles.p75 == pytest.approx(
+            stats["percentiles"]["p75"], abs=tolerance
+        ), "p75 percentile mismatch"
+        assert percentiles.p90 == pytest.approx(
+            stats["percentiles"]["p90"], abs=tolerance
+        ), "p90 percentile mismatch"
+        assert percentiles.p95 == pytest.approx(
+            stats["percentiles"]["p95"], abs=tolerance
+        ), "p95 percentile mismatch"
+        assert percentiles.p99 == pytest.approx(
+            stats["percentiles"]["p99"], abs=tolerance
+        ), "p99 percentile mismatch"
+        assert percentiles.p999 == pytest.approx(
+            stats["percentiles"]["p999"], abs=(tolerance * 2)
+        ), "p999 percentile mismatch"
+
+    @pytest.mark.sanity
+    @pytest.mark.parametrize(
+        ("pdf", "error_match"),
+        [
+            (np.array([1, 2, 3]), "must be a 2D array"),
+            (np.array([[1, 2, 3]]), "must be a 2D array"),
+            (np.array([[1.0, -0.5], [2.0, 0.5]]), "must be non-negative"),
+
(np.array([[1.0, 0.3], [2.0, 0.5]]), "must sum to 1"), + ], + ) + def test_from_pdf_invalid(self, pdf, error_match): + with pytest.raises(ValueError, match=error_match): + Percentiles.from_pdf(pdf) + + @pytest.mark.smoke + def test_marshalling(self, valid_instances): + instance, _dist_type, _values, _pdf, stats = valid_instances + data_dict = instance.model_dump() + assert isinstance(data_dict, dict) + for param in stats["percentiles"]: + assert param in data_dict + assert data_dict[param] == getattr(instance, param) + + recreated = Percentiles.model_validate(data_dict) + assert isinstance(recreated, Percentiles) + for param in stats["percentiles"]: + assert getattr(recreated, param) == getattr(instance, param) + + +class TestDistributionSummary: + @pytest.fixture + def valid_instances( + self, + probability_distributions: tuple[ + str | None, np.ndarray, np.ndarray, dict[str, float] + ], + ) -> tuple[ + DistributionSummary, str | None, np.ndarray, np.ndarray, dict[str, float] + ]: + dist_type, samples, pdf, stats = probability_distributions + instance = DistributionSummary( + mean=stats["mean"], + median=stats["median"], + mode=0.0, + variance=stats["variance"], + std_dev=stats["std_dev"], + min=stats["min"], + max=stats["max"], + count=stats["count"], + total_sum=stats["total_sum"], + percentiles=Percentiles(**stats["percentiles"]), + pdf=pdf, + ) - assert distribution_summary.max == 4.0 # All 4 requests concurrent at start - assert distribution_summary.min == 1.0 # 1 request still running at the end + return instance, dist_type, samples, pdf, stats + + @pytest.mark.smoke + def test_class_signatures(self): + assert issubclass(DistributionSummary, BaseModel) + assert "mean" in DistributionSummary.model_fields + assert "median" in DistributionSummary.model_fields + assert "mode" in DistributionSummary.model_fields + assert "variance" in DistributionSummary.model_fields + assert "std_dev" in DistributionSummary.model_fields + assert "min" in DistributionSummary.model_fields + assert "max" in DistributionSummary.model_fields + assert "count" in DistributionSummary.model_fields + assert "total_sum" in DistributionSummary.model_fields + assert "percentiles" in DistributionSummary.model_fields + assert "pdf" in DistributionSummary.model_fields + assert hasattr(DistributionSummary, "from_pdf") + assert hasattr(DistributionSummary, "from_values") + assert hasattr(DistributionSummary, "rate_distribution_from_timings") + assert hasattr(DistributionSummary, "concurrency_distribution_from_timings") + + @pytest.mark.smoke + def test_initialization( + self, + valid_instances: tuple[ + DistributionSummary, str | None, np.ndarray, np.ndarray, dict[str, float] + ], + ): + instance, _dist_type, _samples, _pdf, stats = valid_instances + assert instance.mean == stats["mean"] + assert instance.median == stats["median"] + assert instance.variance == stats["variance"] + assert instance.std_dev == stats["std_dev"] + assert instance.min == stats["min"] + assert instance.max == stats["max"] + assert instance.count == stats["count"] + assert instance.total_sum == stats["total_sum"] + assert isinstance(instance.percentiles, Percentiles) + for param in stats["percentiles"]: + assert getattr(instance.percentiles, param) == stats["percentiles"][param] + assert instance.pdf is None or isinstance(instance.pdf, list) + + @pytest.mark.sanity + @pytest.mark.parametrize( + "missing_field", + [ + "mean", + "median", + "mode", + "variance", + "std_dev", + "min", + "max", + "count", + "total_sum", + "percentiles", + ], + ) + 
def test_invalid_initialization(self, missing_field): + test_kwargs = { + "mean": 50.0, + "median": 50.0, + "mode": 50.0, + "variance": 835.0, + "std_dev": math.sqrt(835.0), + "min": 0.0, + "max": 100.0, + "count": 1001, + "total_sum": 50050.0, + "percentiles": Percentiles( + p001=0.1, + p01=1.0, + p05=5.0, + p10=10.0, + p25=25.0, + p50=50.0, + p75=75.0, + p90=90.0, + p95=95.0, + p99=99.0, + p999=99.9, + ), + } + del test_kwargs[missing_field] + + with pytest.raises(ValidationError): + DistributionSummary(**test_kwargs) + + @pytest.mark.smoke + @pytest.mark.parametrize("include_pdf", [False, True]) + def test_from_pdf( + self, + valid_instances: tuple[ + DistributionSummary, str | None, np.ndarray, np.ndarray, dict[str, float] + ], + include_pdf: bool | int, + ): + _instance, _dist_type, values, pdf, stats = valid_instances + + tolerance = 0.1 * abs(stats["std_dev"]) # within 10% of standard deviation + summary = DistributionSummary.from_pdf(pdf, include_pdf=include_pdf) + assert summary.mean == pytest.approx(stats["mean"], abs=tolerance), ( + "mean mismatch" + ) + assert summary.median == pytest.approx(stats["median"], abs=tolerance), ( + "median mismatch" + ) + assert summary.variance == pytest.approx(stats["variance"], abs=tolerance), ( + "variance mismatch" + ) + assert summary.std_dev == pytest.approx(stats["std_dev"], abs=tolerance), ( + "std_dev mismatch" + ) + assert summary.min == pytest.approx(stats["min"], abs=tolerance), "min mismatch" + assert summary.max == pytest.approx(stats["max"], abs=tolerance), "max mismatch" + assert summary.count == stats["count"], "count mismatch" + assert summary.total_sum == pytest.approx(stats["total_sum"], abs=tolerance), ( + "total_sum mismatch" + ) + assert isinstance(summary.percentiles, Percentiles) + for param in stats["percentiles"]: + assert getattr(summary.percentiles, param) == pytest.approx( + stats["percentiles"][param], + abs=tolerance if param != "p999" else (tolerance * 2), + ), f"{param} percentile mismatch" + + if include_pdf is False: + assert summary.pdf is None + elif include_pdf is True: + assert summary.pdf is not None + assert isinstance(summary.pdf, list) + assert len(summary.pdf) == len(pdf) + + @pytest.mark.smoke + @pytest.mark.parametrize("include_pdf", [False, True]) + def test_from_values( + self, + valid_instances: tuple[ + DistributionSummary, str | None, np.ndarray, np.ndarray, dict[str, float] + ], + include_pdf: bool | int, + ): + _instance, _dist_type, values, _pdf, stats = valid_instances + + tolerance = 0.1 * abs(stats["std_dev"]) # within 10% of standard deviation + summary = DistributionSummary.from_values(values, include_pdf=include_pdf) + assert summary.mean == pytest.approx(stats["mean"], abs=tolerance), ( + "mean mismatch" + ) + assert summary.median == pytest.approx(stats["median"], abs=tolerance), ( + "median mismatch" + ) + assert summary.variance == pytest.approx(stats["variance"], abs=tolerance), ( + "variance mismatch" + ) + assert summary.std_dev == pytest.approx(stats["std_dev"], abs=tolerance), ( + "std_dev mismatch" + ) + assert summary.min == pytest.approx(stats["min"], abs=tolerance), "min mismatch" + assert summary.max == pytest.approx(stats["max"], abs=tolerance), "max mismatch" + assert summary.count == stats["count"], "count mismatch" + assert summary.total_sum == pytest.approx(stats["total_sum"], abs=tolerance), ( + "total_sum mismatch" + ) + assert isinstance(summary.percentiles, Percentiles) + for param in stats["percentiles"]: + assert getattr(summary.percentiles, param) == 
pytest.approx(
+                stats["percentiles"][param],
+                abs=tolerance if param != "p999" else (tolerance * 2),
+            ), f"{param} percentile mismatch"
+
+        if include_pdf is False:
+            assert summary.pdf is None
+        elif include_pdf is True:
+            assert summary.pdf is not None
+            assert isinstance(summary.pdf, list)
+            assert len(summary.pdf) > 0 if len(values) > 0 else len(summary.pdf) == 0
+
+    @pytest.mark.smoke
+    @pytest.mark.parametrize(
+        ("limit_start_time", "limit_end_time", "include_pdf"),
+        [
+            (False, False, False),
+            (True, False, True),
+            (False, True, False),
+            (True, True, True),
+        ],
+    )
+    def test_rate_distribution_from_timings(
+        self,
+        valid_instances: tuple[
+            DistributionSummary, str | None, np.ndarray, np.ndarray, dict[str, float]
+        ],
+        limit_start_time: bool,
+        limit_end_time: bool,
+        include_pdf: bool | int,
+    ):
+        _instance, dist_type, _values, pdf, stats = valid_instances
+
+        if dist_type in ("exponential", "poisson"):
+            pytest.skip(
+                f"Skipping rate distribution test for {dist_type} distribution "
+                "due to inherent variability and incompatibility with rate assumptions."
+            )
+
+        rng = np.random.default_rng(seed=42)
+
+        if len(pdf) > 0:
+            # The PDF gives the expected distribution for the rates
+            # So, we can use it to sample individual, instantaneous rates
+            # and convert those to timings by inverting and accumulating
+            sampled_rates = rng.choice(pdf[:, 0], size=100000, p=pdf[:, 1])
+            delta_times = 1.0 / np.clip(sampled_rates, a_min=1e-6, a_max=None)
+            timings = np.cumsum(delta_times)
+        else:
+            timings = np.array([])
+
+        # Now, compute the rate distribution from the timings and compare
+        start_time = stats["mean"] if limit_start_time and len(timings) > 0 else None
+        end_time = (
+            np.max(timings) - stats["mean"]
+            if limit_end_time and len(timings) > 0
+            else None
+        )
+        distribution = DistributionSummary.rate_distribution_from_timings(
+            timings, start_time=start_time, end_time=end_time, include_pdf=include_pdf
+        )
+        # Check expected nearly exact values (mean and count)
+        expected_rate = (
+            len(timings) / (timings[-1] - timings[0]) if len(timings) > 1 else 0.0
+        )
+        assert distribution.mean == pytest.approx(expected_rate, rel=1e-3), (
+            "expected mean rate mismatch"
+        )
+        expected_count = len(timings)
+        if start_time and len(timings) > 0:
+            expected_count -= len(timings[timings < start_time])
+        if end_time and len(timings) > 0:
+            expected_count -= len(timings[timings > end_time])
+        assert distribution.count == expected_count, "expected count mismatch"
+
+        # Loosely validate against original stats (randomness in sampling)
+        tolerance = 0.5 * abs(stats["std_dev"])  # within 50% of standard deviation
+        assert distribution.mean == pytest.approx(stats["mean"], abs=tolerance), (
+            "mean mismatch"
+        )
+        assert distribution.median == pytest.approx(stats["median"], abs=tolerance), (
+            "median mismatch"
+        )
+        assert distribution.std_dev == pytest.approx(stats["std_dev"], abs=tolerance), (
+            "std_dev mismatch"
+        )
-
-@pytest.mark.sanity
-def test_distribution_summary_concurrency_epsilon_edge_case():
-    """Test the exact epsilon boundary condition.
+ @pytest.mark.smoke + @pytest.mark.parametrize( + ("concurrency_type", "include_pdf"), + [ + ("sequential", False), + ("parallel", True), + ("constant_rate", False), + ("burst", True), + ("triangular_ramp", False), + ], + ) + def test_concurrency_distribution_from_timings(self, concurrency_type, include_pdf): + ( + _concurrency_type, + requests, + stats, + ) = concurrency_distributions(concurrency_type, num_requests=1000) - ### WRITTEN BY AI ### - """ - epsilon = 1e-6 + distribution = DistributionSummary.concurrency_distribution_from_timings( + requests, + start_time=stats["start_time"], + end_time=stats["end_time"], + include_pdf=include_pdf, + ) - # Test requests that are exactly epsilon apart - should be merged - requests_exactly_epsilon = [ - (1.0, 2.0), - (1.0 + epsilon, 2.5), # Exactly epsilon apart - (2.0, 2.5), # Another close request - ] + assert distribution.mean == pytest.approx( + stats["mean_concurrency"], rel=1e-2 + ), "mean concurrency mismatch" + assert distribution.median == pytest.approx( + stats["median_concurrency"], rel=1e-2 + ), "median concurrency mismatch" + assert distribution.std_dev == pytest.approx( + stats["std_dev_concurrency"], rel=1e-2 + ), "std_dev concurrency mismatch" + + @pytest.mark.smoke + def test_marshalling(self, valid_instances): + instance, _dist_type, _values, _pdf, stats = valid_instances + data_dict = instance.model_dump() + assert isinstance(data_dict, dict) + for param in [ + "mean", + "median", + "mode", + "variance", + "std_dev", + "min", + "max", + "count", + "total_sum", + "percentiles", + "pdf", + ]: + assert param in data_dict + if param == "percentiles": + for p_param in stats["percentiles"]: + assert ( + getattr(instance.percentiles, p_param) + == data_dict["percentiles"][p_param] + ) + else: + assert data_dict[param] == getattr(instance, param) + + recreated = DistributionSummary.model_validate(data_dict) + assert isinstance(recreated, DistributionSummary) + for param in [ + "mean", + "median", + "mode", + "variance", + "std_dev", + "min", + "max", + "count", + "total_sum", + "percentiles", + "pdf", + ]: + if param == "percentiles": + for p_param in stats["percentiles"]: + assert getattr(recreated.percentiles, p_param) == getattr( + instance.percentiles, p_param + ) + else: + assert getattr(recreated, param) == getattr(instance, param) + + @pytest.mark.sanity + @pytest.mark.parametrize( + ("values", "error_type"), + [ + ("not_a_list", ValueError), + ({"invalid": "dict"}, ValueError), + (None, ValueError), + ], + ) + def test_from_values_invalid_input(self, values, error_type): + """Test DistributionSummary.from_values with invalid input types.""" + with pytest.raises(error_type): + DistributionSummary.from_values(values) + + @pytest.mark.sanity + @pytest.mark.parametrize( + ("pdf", "error_match"), + [ + (np.array([1, 2, 3]), "must be a 2D array"), + (np.array([[1, 2, 3]]), "must be a 2D array"), + (np.array([[1.0, -0.5], [2.0, 0.5]]), "must be non-negative"), + (np.array([[1.0, 0.3], [2.0, 0.5]]), "must sum to 1"), + ], + ) + def test_from_pdf_invalid(self, pdf, error_match): + """Test DistributionSummary.from_pdf with invalid PDFs.""" + with pytest.raises(ValueError, match=error_match): + DistributionSummary.from_pdf(pdf) + + @pytest.mark.sanity + def test_from_values_with_weights(self): + """Test DistributionSummary.from_values with weighted values.""" + # Values with weights: (value, weight) + values = [(1.0, 2.0), (2.0, 1.0), (3.0, 1.0)] + summary = DistributionSummary.from_values(values) + + assert isinstance(summary, 
DistributionSummary) + # Count is sum of weights: 2 + 1 + 1 = 4 + assert summary.count == 4 + # Mean should be weighted: (1*2 + 2*1 + 3*1) / (2+1+1) = 7/4 = 1.75 + assert summary.mean == pytest.approx(1.75, abs=0.01) + + @pytest.mark.sanity + def test_rate_distribution_empty_timings(self): + """Test rate_distribution_from_timings with empty input.""" + summary = DistributionSummary.rate_distribution_from_timings([]) + assert summary.count == 0 + assert summary.mean == 0.0 + + @pytest.mark.sanity + def test_concurrency_distribution_empty_intervals(self): + """Test concurrency_distribution_from_timings with empty input.""" + summary = DistributionSummary.concurrency_distribution_from_timings([]) + assert summary.count == 0 + assert summary.mean == 0.0 + + @pytest.mark.sanity + def test_rate_distribution_single_event(self): + """Test rate_distribution_from_timings with single event.""" + summary = DistributionSummary.rate_distribution_from_timings([1.0]) + # Single event results in no rates (need at least 2 for intervals) + assert summary.count == 0 + assert summary.mean == 0.0 + + @pytest.mark.sanity + def test_concurrency_with_weighted_intervals(self): + """Test concurrency_distribution_from_timings with weighted intervals.""" + # Intervals with weights: (start, end, weight) + intervals = [(0.0, 10.0, 2.0), (5.0, 15.0, 1.0)] + summary = DistributionSummary.concurrency_distribution_from_timings(intervals) + + assert isinstance(summary, DistributionSummary) + assert summary.count == 2 + + +class TestStatusDistributionSummary: + @pytest.fixture( + params=[ + { + "successful": [1.0, 2.0, 3.0], + "incomplete": [4.0, 5.0], + "errored": [6.0], + }, + { + "successful": np.array([10.0, 20.0, 30.0, 40.0]), + "incomplete": np.array([50.0]), + "errored": np.array([]), + }, + { + "successful": [], + "incomplete": [], + "errored": [], + }, + ] + ) + def valid_instances( + self, + request, + ) -> tuple[StatusDistributionSummary, dict[str, list[float] | np.ndarray]]: + """Fixture providing test data for StatusDistributionSummary.""" + test_data = request.param + instance = StatusDistributionSummary.from_values( + successful=test_data["successful"], + incomplete=test_data["incomplete"], + errored=test_data["errored"], + ) + return instance, test_data + + @pytest.mark.smoke + def test_class_signatures(self): + """Test StatusDistributionSummary class structure and methods.""" + assert hasattr(StatusDistributionSummary, "from_values") + assert hasattr(StatusDistributionSummary, "rate_distribution_from_timings") + assert hasattr( + StatusDistributionSummary, "concurrency_distribution_from_timings" + ) + assert "total" in StatusDistributionSummary.model_fields + assert "successful" in StatusDistributionSummary.model_fields + assert "incomplete" in StatusDistributionSummary.model_fields + assert "errored" in StatusDistributionSummary.model_fields + + @pytest.mark.smoke + def test_initialization( + self, + valid_instances: tuple[ + StatusDistributionSummary, dict[str, list[float] | np.ndarray] + ], + ): + """Test StatusDistributionSummary initialization.""" + instance, test_data = valid_instances + assert isinstance(instance, StatusDistributionSummary) + assert isinstance(instance.total, DistributionSummary) + assert isinstance(instance.successful, DistributionSummary) + assert isinstance(instance.incomplete, DistributionSummary) + assert isinstance(instance.errored, DistributionSummary) + + # Verify counts match expected + successful_count = ( + len(test_data["successful"]) + if 
isinstance(test_data["successful"], list) + else test_data["successful"].shape[0] + ) + incomplete_count = ( + len(test_data["incomplete"]) + if isinstance(test_data["incomplete"], list) + else test_data["incomplete"].shape[0] + ) + errored_count = ( + len(test_data["errored"]) + if isinstance(test_data["errored"], list) + else test_data["errored"].shape[0] + ) - dist_epsilon = DistributionSummary.from_request_times( - requests_exactly_epsilon, distribution_type="concurrency", epsilon=epsilon - ) + assert instance.successful.count == successful_count + assert instance.incomplete.count == incomplete_count + assert instance.errored.count == errored_count + assert ( + instance.total.count == successful_count + incomplete_count + errored_count + ) - # Should be treated as concurrent (merged events) - assert dist_epsilon.max == 2.0 - assert dist_epsilon.min == 2.0 + @pytest.mark.sanity + @pytest.mark.parametrize( + ("field", "value"), + [ + ("successful", "invalid_string"), + ("incomplete", 123), + ("errored", [1, 2, 3]), + ("total", {"dict": "value"}), + ], + ) + def test_invalid_initialization(self, field, value): + """Test StatusDistributionSummary with invalid field types.""" + test_kwargs = { + "successful": DistributionSummary.from_values([1.0, 2.0]), + "incomplete": DistributionSummary.from_values([3.0]), + "errored": DistributionSummary.from_values([]), + "total": DistributionSummary.from_values([1.0, 2.0, 3.0]), + } + test_kwargs[field] = value + + with pytest.raises(ValidationError): + StatusDistributionSummary(**test_kwargs) + + @pytest.mark.smoke + @pytest.mark.parametrize("include_pdf", [False, True]) + def test_from_values( + self, + valid_instances: tuple[ + StatusDistributionSummary, dict[str, list[float] | np.ndarray] + ], + include_pdf: bool | int, + ): + """Test creating StatusDistributionSummary from values.""" + _instance, test_data = valid_instances + + summary = StatusDistributionSummary.from_values( + successful=test_data["successful"], + incomplete=test_data["incomplete"], + errored=test_data["errored"], + include_pdf=include_pdf, + ) - # Test requests that are just over epsilon apart - should NOT be merged - requests_over_epsilon = [ - (1.0, 2.0), - (1.0 + epsilon * 1.1, 2.5), # Just over epsilon apart - (2.0, 2.5), # Another close request - ] + assert isinstance(summary, StatusDistributionSummary) + assert isinstance(summary.total, DistributionSummary) + assert isinstance(summary.successful, DistributionSummary) + assert isinstance(summary.incomplete, DistributionSummary) + assert isinstance(summary.errored, DistributionSummary) + + if include_pdf is False: + assert summary.total.pdf is None + assert summary.successful.pdf is None + assert summary.incomplete.pdf is None + assert summary.errored.pdf is None + elif include_pdf is True: + assert summary.total.pdf is not None or summary.total.count == 0 + assert summary.successful.pdf is not None or summary.successful.count == 0 + assert summary.incomplete.pdf is not None or summary.incomplete.count == 0 + assert summary.errored.pdf is not None or summary.errored.count == 0 + + @pytest.mark.smoke + @pytest.mark.parametrize( + ("limit_start_time", "limit_end_time", "include_pdf"), + [ + (False, False, False), + (True, False, True), + (False, True, False), + (True, True, True), + ], + ) + def test_rate_distribution_from_timings( + self, + limit_start_time: bool, + limit_end_time: bool, + include_pdf: bool | int, + ): + """Test creating rate distribution from timings by status.""" + rng = np.random.default_rng(seed=42) + 
successful_times = rng.uniform(0, 100, 50).tolist() + incomplete_times = rng.uniform(0, 100, 20).tolist() + errored_times = rng.uniform(0, 100, 10).tolist() + + start_time = 25.0 if limit_start_time else None + end_time = 75.0 if limit_end_time else None + + summary = StatusDistributionSummary.rate_distribution_from_timings( + successful=successful_times, + incomplete=incomplete_times, + errored=errored_times, + start_time=start_time, + end_time=end_time, + include_pdf=include_pdf, + ) - dist_over_epsilon = DistributionSummary.from_request_times( - requests_over_epsilon, distribution_type="concurrency", epsilon=epsilon - ) + assert isinstance(summary, StatusDistributionSummary) + assert isinstance(summary.total, DistributionSummary) + assert isinstance(summary.successful, DistributionSummary) + assert isinstance(summary.incomplete, DistributionSummary) + assert isinstance(summary.errored, DistributionSummary) + + # Verify counts are reasonable + assert summary.total.count >= 0 + assert summary.successful.count >= 0 + assert summary.incomplete.count >= 0 + assert summary.errored.count >= 0 + + @pytest.mark.smoke + @pytest.mark.parametrize( + "include_pdf", + [ + False, + True, + ], + ) + def test_concurrency_distribution_from_timings(self, include_pdf: bool | int): + """Test creating concurrency distribution from intervals by status.""" + rng = np.random.default_rng(seed=42) + num_successful = 30 + num_incomplete = 10 + num_errored = 5 + + # Generate realistic intervals (start, end) + successful_starts = rng.uniform(0, 80, num_successful) + successful_intervals = [ + (start, start + rng.uniform(1, 20)) for start in successful_starts + ] + + incomplete_starts = rng.uniform(0, 80, num_incomplete) + incomplete_intervals = [ + (start, start + rng.uniform(1, 20)) for start in incomplete_starts + ] + + errored_starts = rng.uniform(0, 80, num_errored) + errored_intervals = [ + (start, start + rng.uniform(1, 20)) for start in errored_starts + ] + + summary = StatusDistributionSummary.concurrency_distribution_from_timings( + successful=successful_intervals, + incomplete=incomplete_intervals, + errored=errored_intervals, + include_pdf=include_pdf, + ) - # These should be treated separately, so max concurrency depends on overlap - # At t=1.0 to 1.0+epsilon*1.1: 1 concurrent - # At t=1.0+epsilon*1.1 to 2.0: 2 concurrent - # At t=2.0 to 2.5: 1 concurrent - assert dist_over_epsilon.max == 2.0 - assert dist_over_epsilon.min == 1.0 + assert isinstance(summary, StatusDistributionSummary) + assert isinstance(summary.total, DistributionSummary) + assert isinstance(summary.successful, DistributionSummary) + assert isinstance(summary.incomplete, DistributionSummary) + assert isinstance(summary.errored, DistributionSummary) + + # Verify counts match + assert summary.successful.count == num_successful + assert summary.incomplete.count == num_incomplete + assert summary.errored.count == num_errored + assert summary.total.count == num_successful + num_incomplete + num_errored + + @pytest.mark.smoke + def test_marshalling(self, valid_instances): + """Test StatusDistributionSummary serialization and deserialization.""" + instance, _test_data = valid_instances + data_dict = instance.model_dump() + assert isinstance(data_dict, dict) + assert "total" in data_dict + assert "successful" in data_dict + assert "incomplete" in data_dict + assert "errored" in data_dict + + # Verify each status has distribution summary data + for status in ["total", "successful", "incomplete", "errored"]: + assert isinstance(data_dict[status], 
dict)
+            assert "mean" in data_dict[status]
+            assert "median" in data_dict[status]
+            assert "count" in data_dict[status]
+
+        recreated = StatusDistributionSummary.model_validate(data_dict)
+        assert isinstance(recreated, StatusDistributionSummary)
+        assert recreated.total.count == instance.total.count
+        assert recreated.successful.count == instance.successful.count
+        assert recreated.incomplete.count == instance.incomplete.count
+        assert recreated.errored.count == instance.errored.count