diff --git a/pylock.toml b/pylock.toml index c4a1545d..bbbf866a 100644 --- a/pylock.toml +++ b/pylock.toml @@ -167,64 +167,13 @@ dependencies = [ "importlib-metadata; python_version < \"3.8\"", ] -[[packages]] -name = "setuptools-git-versioning" -version = "2.1.0" -requires-python = ">=3.7" -sdist = {name = "setuptools_git_versioning-2.1.0.tar.gz", url = "https://files.pythonhosted.org/packages/f0/72/507b0b459b1fdbf5705aecbc5330c32d62dd41560718d2720bb6d94607f5/setuptools_git_versioning-2.1.0.tar.gz", hashes = {sha256 = "6aef5b8bb1cfb953b6b343d27cbfc561d96cf2a2ee23c2e0dd3591042a059921"}} -wheels = [ - {name = "setuptools_git_versioning-2.1.0-py3-none-any.whl",url = "https://files.pythonhosted.org/packages/c0/ba/daf16c2d1965bf6237fb696639e3e93645ac6801f7dcaf9ec694a74e9326/setuptools_git_versioning-2.1.0-py3-none-any.whl",hashes = {sha256 = "09a15cbb9a00884e91a3591a4c9ec1ff93c24b1b4a40de39a44815196beb7ebf"}}, -] -marker = "\"dev\" in extras" - -[packages.tool.pdm] -dependencies = [ - "packaging", - "setuptools", - "tomli>=2.0.1; python_version < \"3.11\"", -] - -[[packages]] -name = "build" -version = "1.3.0" -requires-python = ">=3.9" -sdist = {name = "build-1.3.0.tar.gz", url = "https://files.pythonhosted.org/packages/25/1c/23e33405a7c9eac261dff640926b8b5adaed6a6eb3e1767d441ed611d0c0/build-1.3.0.tar.gz", hashes = {sha256 = "698edd0ea270bde950f53aed21f3a0135672206f3911e0176261a31e0e07b397"}} -wheels = [ - {name = "build-1.3.0-py3-none-any.whl",url = "https://files.pythonhosted.org/packages/cb/8c/2b30c12155ad8de0cf641d76a8b396a16d2c36bc6d50b621a62b7c4567c1/build-1.3.0-py3-none-any.whl",hashes = {sha256 = "7145f0b5061ba90a1500d60bd1b13ca0a8a4cebdd0cc16ed8adf1c0e739f43b4"}}, -] -marker = "\"dev\" in extras" - -[packages.tool.pdm] -dependencies = [ - "packaging>=19.1", - "pyproject-hooks", - "colorama; os_name == \"nt\"", - "importlib-metadata>=4.6; python_full_version < \"3.10.2\"", - "tomli>=1.1.0; python_version < \"3.11\"", -] - -[[packages]] -name = "culsans" -version = "0.9.0" -requires-python = ">=3.8" -sdist = {name = "culsans-0.9.0.tar.gz", url = "https://files.pythonhosted.org/packages/90/5d/12e7e16b0caafaa8cca0728dd817204afd1274ddb35531b029b1c5cf7b2a/culsans-0.9.0.tar.gz", hashes = {sha256 = "942dd3c3c77f20e9ac3383d9a5ef8b7b24c0dac1a593bdb20d46c8a38720a5f3"}} -wheels = [ - {name = "culsans-0.9.0-py3-none-any.whl",url = "https://files.pythonhosted.org/packages/6f/b4/1e3cccb48f09e89e0cfc06925182cbcd36abf80b8eda2489430b41c7eaff/culsans-0.9.0-py3-none-any.whl",hashes = {sha256 = "d3537b65bbb341c2ac72e7d152deb8ab893b2a00452d2a68702a1a1a41619d6f"}}, -] -marker = "\"default\" in dependency_groups" - -[packages.tool.pdm] -dependencies = [ - "aiologic>=0.13.0", -] - [[packages]] name = "datasets" -version = "4.2.0" +version = "4.4.0" requires-python = ">=3.9.0" -sdist = {name = "datasets-4.2.0.tar.gz", url = "https://files.pythonhosted.org/packages/70/48/0186fbc4b86a4f9ecaf04eb01e877e78b53bfa0b03be9c84b2298431ba33/datasets-4.2.0.tar.gz", hashes = {sha256 = "8333a7db9f3bb8044c1b819a35d4e3e2809596c837793b0921382efffdc36e78"}} +sdist = {name = "datasets-4.4.0.tar.gz", url = "https://files.pythonhosted.org/packages/57/13/f05a80bbbac5f62e492e5e463ec59a4479647ef9c376b1fdfaa4d3ed01cc/datasets-4.4.0.tar.gz", hashes = {sha256 = "0430d39b9f13b53c37afb80c23c7e5d8c6ceccc014c14a14d15fa2b4e8688d2a"}} wheels = [ - {name = "datasets-4.2.0-py3-none-any.whl",url = "https://files.pythonhosted.org/packages/91/9e/0bbbd09b116fd8ee2d3617e28e6598551d2f0f24d3a2ce99cc87ec85aeb0/datasets-4.2.0-py3-none-any.whl",hashes = 
{sha256 = "fdc43aaf4a73b31f64f80f72f195ab413a1141ed15555d675b2fd17926f8b026"}}, + {name = "datasets-4.4.0-py3-none-any.whl",url = "https://files.pythonhosted.org/packages/1e/31/d552336985f747b19f0a852d98ca7a2ef4727ba956b38041cfbda08dde0a/datasets-4.4.0-py3-none-any.whl",hashes = {sha256 = "b7e6d1d48c2e1d3a95d6b378e8fc3d7ab29f24f14ddf505a8d417dd09c692f19"}}, ] marker = "\"default\" in dependency_groups or \"all\" in extras or \"audio\" in extras or \"dev\" in extras or \"vision\" in extras" @@ -239,8 +188,8 @@ dependencies = [ "httpx<1.0.0", "tqdm>=4.66.3", "xxhash", - "multiprocess<0.70.17", - "fsspec[http]<=2025.9.0,>=2023.1.0", + "multiprocess<0.70.19", + "fsspec[http]<=2025.10.0,>=2023.1.0", "huggingface-hub<2.0,>=0.25.0", "packaging", "pyyaml>=5.1", @@ -381,6 +330,57 @@ marker = "\"default\" in dependency_groups or \"all\" in extras or \"audio\" in [packages.tool.pdm] dependencies = [] +[[packages]] +name = "setuptools-git-versioning" +version = "2.1.0" +requires-python = ">=3.7" +sdist = {name = "setuptools_git_versioning-2.1.0.tar.gz", url = "https://files.pythonhosted.org/packages/f0/72/507b0b459b1fdbf5705aecbc5330c32d62dd41560718d2720bb6d94607f5/setuptools_git_versioning-2.1.0.tar.gz", hashes = {sha256 = "6aef5b8bb1cfb953b6b343d27cbfc561d96cf2a2ee23c2e0dd3591042a059921"}} +wheels = [ + {name = "setuptools_git_versioning-2.1.0-py3-none-any.whl",url = "https://files.pythonhosted.org/packages/c0/ba/daf16c2d1965bf6237fb696639e3e93645ac6801f7dcaf9ec694a74e9326/setuptools_git_versioning-2.1.0-py3-none-any.whl",hashes = {sha256 = "09a15cbb9a00884e91a3591a4c9ec1ff93c24b1b4a40de39a44815196beb7ebf"}}, +] +marker = "\"dev\" in extras" + +[packages.tool.pdm] +dependencies = [ + "packaging", + "setuptools", + "tomli>=2.0.1; python_version < \"3.11\"", +] + +[[packages]] +name = "build" +version = "1.3.0" +requires-python = ">=3.9" +sdist = {name = "build-1.3.0.tar.gz", url = "https://files.pythonhosted.org/packages/25/1c/23e33405a7c9eac261dff640926b8b5adaed6a6eb3e1767d441ed611d0c0/build-1.3.0.tar.gz", hashes = {sha256 = "698edd0ea270bde950f53aed21f3a0135672206f3911e0176261a31e0e07b397"}} +wheels = [ + {name = "build-1.3.0-py3-none-any.whl",url = "https://files.pythonhosted.org/packages/cb/8c/2b30c12155ad8de0cf641d76a8b396a16d2c36bc6d50b621a62b7c4567c1/build-1.3.0-py3-none-any.whl",hashes = {sha256 = "7145f0b5061ba90a1500d60bd1b13ca0a8a4cebdd0cc16ed8adf1c0e739f43b4"}}, +] +marker = "\"dev\" in extras" + +[packages.tool.pdm] +dependencies = [ + "packaging>=19.1", + "pyproject-hooks", + "colorama; os_name == \"nt\"", + "importlib-metadata>=4.6; python_full_version < \"3.10.2\"", + "tomli>=1.1.0; python_version < \"3.11\"", +] + +[[packages]] +name = "culsans" +version = "0.9.0" +requires-python = ">=3.8" +sdist = {name = "culsans-0.9.0.tar.gz", url = "https://files.pythonhosted.org/packages/90/5d/12e7e16b0caafaa8cca0728dd817204afd1274ddb35531b029b1c5cf7b2a/culsans-0.9.0.tar.gz", hashes = {sha256 = "942dd3c3c77f20e9ac3383d9a5ef8b7b24c0dac1a593bdb20d46c8a38720a5f3"}} +wheels = [ + {name = "culsans-0.9.0-py3-none-any.whl",url = "https://files.pythonhosted.org/packages/6f/b4/1e3cccb48f09e89e0cfc06925182cbcd36abf80b8eda2489430b41c7eaff/culsans-0.9.0-py3-none-any.whl",hashes = {sha256 = "d3537b65bbb341c2ac72e7d152deb8ab893b2a00452d2a68702a1a1a41619d6f"}}, +] +marker = "\"default\" in dependency_groups" + +[packages.tool.pdm] +dependencies = [ + "aiologic>=0.13.0", +] + [[packages]] name = "ftfy" version = "6.3.1" @@ -1247,6 +1247,22 @@ marker = "\"all\" in extras or \"dev\" in extras or \"perf\" in 
extras or \"reco [packages.tool.pdm] dependencies = [] +[[packages]] +name = "pandas-stubs" +version = "2.3.2.250926" +requires-python = ">=3.10" +sdist = {name = "pandas_stubs-2.3.2.250926.tar.gz", url = "https://files.pythonhosted.org/packages/1b/3b/32be58a125db39d0b5f62cc93795f32b5bb2915bd5c4a46f0e35171985e2/pandas_stubs-2.3.2.250926.tar.gz", hashes = {sha256 = "c64b9932760ceefb96a3222b953e6a251321a9832a28548be6506df473a66406"}} +wheels = [ + {name = "pandas_stubs-2.3.2.250926-py3-none-any.whl",url = "https://files.pythonhosted.org/packages/40/96/1e4a035eaf4dce9610aac6e43026d0c6baa05773daf6d21e635a4fe19e21/pandas_stubs-2.3.2.250926-py3-none-any.whl",hashes = {sha256 = "81121818453dcfe00f45c852f4dceee043640b813830f6e7bd084a4ef7ff7270"}}, +] +marker = "\"dev\" in extras" + +[packages.tool.pdm] +dependencies = [ + "numpy>=1.23.5", + "types-pytz>=2022.1.1", +] + [[packages]] name = "protobuf" version = "6.32.1" @@ -1306,6 +1322,19 @@ dependencies = [ "setuptools>=70.1.0", ] +[[packages]] +name = "tabulate" +version = "0.9.0" +requires-python = ">=3.7" +sdist = {name = "tabulate-0.9.0.tar.gz", url = "https://files.pythonhosted.org/packages/ec/fe/802052aecb21e3797b8f7902564ab6ea0d60ff8ca23952079064155d1ae1/tabulate-0.9.0.tar.gz", hashes = {sha256 = "0095b12bf5966de529c0feb1fa08671671b3368eec77d7ef7ab114be2c068b3c"}} +wheels = [ + {name = "tabulate-0.9.0-py3-none-any.whl",url = "https://files.pythonhosted.org/packages/40/44/4a5f08c96eb108af5cb50b41f76142f0afa346dfa99d5296fe7202a11854/tabulate-0.9.0-py3-none-any.whl",hashes = {sha256 = "024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f"}}, +] +marker = "\"default\" in dependency_groups" + +[packages.tool.pdm] +dependencies = [] + [[packages]] name = "transformers" version = "4.57.1" @@ -1781,11 +1810,11 @@ dependencies = [] [[packages]] name = "fsspec" -version = "2025.9.0" +version = "2025.10.0" requires-python = ">=3.9" -sdist = {name = "fsspec-2025.9.0.tar.gz", url = "https://files.pythonhosted.org/packages/de/e0/bab50af11c2d75c9c4a2a26a5254573c0bd97cea152254401510950486fa/fsspec-2025.9.0.tar.gz", hashes = {sha256 = "19fd429483d25d28b65ec68f9f4adc16c17ea2c7c7bf54ec61360d478fb19c19"}} +sdist = {name = "fsspec-2025.10.0.tar.gz", url = "https://files.pythonhosted.org/packages/24/7f/2747c0d332b9acfa75dc84447a066fdf812b5a6b8d30472b74d309bfe8cb/fsspec-2025.10.0.tar.gz", hashes = {sha256 = "b6789427626f068f9a83ca4e8a3cc050850b6c0f71f99ddb4f542b8266a26a59"}} wheels = [ - {name = "fsspec-2025.9.0-py3-none-any.whl",url = "https://files.pythonhosted.org/packages/47/71/70db47e4f6ce3e5c37a607355f80da8860a33226be640226ac52cb05ef2e/fsspec-2025.9.0-py3-none-any.whl",hashes = {sha256 = "530dc2a2af60a414a832059574df4a6e10cce927f6f4a78209390fe38955cfb7"}}, + {name = "fsspec-2025.10.0-py3-none-any.whl",url = "https://files.pythonhosted.org/packages/eb/02/a6b21098b1d5d6249b7c5ab69dde30108a71e4e819d4a9778f1de1d5b70d/fsspec-2025.10.0-py3-none-any.whl",hashes = {sha256 = "7c7712353ae7d875407f97715f0e1ffcc21e33d5b24556cb1e090ae9409ec61d"}}, ] marker = "\"default\" in dependency_groups or \"all\" in extras or \"audio\" in extras or \"dev\" in extras or \"vision\" in extras" @@ -3596,6 +3625,19 @@ dependencies = [ "html5tagger>=1.2.1", ] +[[packages]] +name = "types-pytz" +version = "2025.2.0.20250809" +requires-python = ">=3.9" +sdist = {name = "types_pytz-2025.2.0.20250809.tar.gz", url = "https://files.pythonhosted.org/packages/07/e2/c774f754de26848f53f05defff5bb21dd9375a059d1ba5b5ea943cf8206e/types_pytz-2025.2.0.20250809.tar.gz", hashes = {sha256 = 
"222e32e6a29bb28871f8834e8785e3801f2dc4441c715cd2082b271eecbe21e5"}} +wheels = [ + {name = "types_pytz-2025.2.0.20250809-py3-none-any.whl",url = "https://files.pythonhosted.org/packages/db/d0/91c24fe54e565f2344d7a6821e6c6bb099841ef09007ea6321a0bac0f808/types_pytz-2025.2.0.20250809-py3-none-any.whl",hashes = {sha256 = "4f55ed1b43e925cf851a756fe1707e0f5deeb1976e15bf844bcaa025e8fbd0db"}}, +] +marker = "\"dev\" in extras" + +[packages.tool.pdm] +dependencies = [] + [[packages]] name = "ujson" version = "5.11.0" @@ -4464,7 +4506,7 @@ marker = "python_full_version >= \"3.10.0\" and python_full_version < \"3.10.2\" dependencies = [] [tool.pdm] -hashes = {sha256 = "a61aad0c4563f9e4a33622000214136c2a7aa01d28a2e89e220a415039e7e3eb"} +hashes = {sha256 = "78b9a92a016e9cc24989f5691183181d059a55ee416647f9e8d00bd35cd38c35"} strategy = ["inherit_metadata", "static_urls"] [[tool.pdm.targets]] diff --git a/pyproject.toml b/pyproject.toml index 1ba5a92f..6583b1b4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -69,6 +69,7 @@ dependencies = [ "pyyaml>=6.0.0", "rich", "sanic", + "tabulate", "transformers", "uvloop>=0.18", "torch", @@ -128,6 +129,7 @@ dev = [ "mdformat-gfm~=0.3.6", # type-checking + "pandas-stubs", "types-PyYAML~=6.0.1", "types-requests~=2.32.0", "types-toml", diff --git a/src/guidellm/__main__.py b/src/guidellm/__main__.py index 1faaaafa..3a2bd6c4 100644 --- a/src/guidellm/__main__.py +++ b/src/guidellm/__main__.py @@ -45,26 +45,12 @@ reimport_benchmarks_report, ) from guidellm.mock_server import MockServer, MockServerConfig -from guidellm.preprocess.dataset import ShortPromptStrategy, process_dataset from guidellm.scheduler import StrategyType from guidellm.schemas import GenerativeRequestType from guidellm.settings import print_config from guidellm.utils import Console, DefaultGroupHandler, get_literal_vals from guidellm.utils import cli as cli_tools -__all__ = [ - "STRATEGY_PROFILE_CHOICES", - "benchmark", - "cli", - "config", - "dataset", - "decode_escaped_str", - "from_file", - "mock_server", - "preprocess", - "run", -] - STRATEGY_PROFILE_CHOICES: list[str] = list(get_literal_vals(ProfileType | StrategyType)) """Available strategy and profile type choices for benchmark execution.""" @@ -256,7 +242,7 @@ def benchmark(): help="Number of worker processes for data loading.", ) @click.option( - "--dataloader_kwargs", + "--dataloader-kwargs", default=BenchmarkGenerativeTextArgs.get_default("dataloader_kwargs"), callback=cli_tools.parse_json, help="JSON string of arguments to pass to the dataloader constructor.", @@ -469,128 +455,6 @@ def preprocess(): """Dataset preprocessing utilities.""" -@preprocess.command( - "dataset", - help=( - "Process a dataset to have specific prompt and output token sizes. " - "Supports multiple strategies for handling prompts and optional " - "Hugging Face Hub upload.\n\n" - "DATA: Path to the input dataset or dataset ID.\n\n" - "OUTPUT_PATH: Path to save the processed dataset, including file suffix." 
- ), - context_settings={"auto_envvar_prefix": "GUIDELLM"}, -) -@click.argument( - "data", - type=str, - required=True, -) -@click.argument( - "output_path", - type=click.Path(file_okay=True, dir_okay=False, writable=True, resolve_path=True), - required=True, -) -@click.option( - "--processor", - type=str, - required=True, - help="Processor or tokenizer name for calculating token counts.", -) -@click.option( - "--processor-args", - default=None, - callback=cli_tools.parse_json, - help="JSON string of arguments to pass to the processor constructor.", -) -@click.option( - "--data-args", - callback=cli_tools.parse_json, - help="JSON string of arguments to pass to dataset creation.", -) -@click.option( - "--short-prompt-strategy", - type=click.Choice([s.value for s in ShortPromptStrategy]), - default=ShortPromptStrategy.IGNORE.value, - show_default=True, - help="Strategy for handling prompts shorter than target length.", -) -@click.option( - "--pad-char", - type=str, - default="", - callback=decode_escaped_str, - help="Character to pad short prompts with when using 'pad' strategy.", -) -@click.option( - "--concat-delimiter", - type=str, - default="", - help=( - "Delimiter for concatenating short prompts (used with 'concatenate' strategy)." - ), -) -@click.option( - "--prompt-tokens", - type=str, - default=None, - help="Prompt tokens configuration (JSON, YAML file, or key=value string).", -) -@click.option( - "--output-tokens", - type=str, - default=None, - help="Output tokens configuration (JSON, YAML file, or key=value string).", -) -@click.option( - "--push-to-hub", - is_flag=True, - help="Push the processed dataset to Hugging Face Hub.", -) -@click.option( - "--hub-dataset-id", - type=str, - default=None, - help=("Hugging Face Hub dataset ID for upload (required if --push-to-hub is set)."), -) -@click.option( - "--random-seed", - type=int, - default=42, - show_default=True, - help="Random seed for reproducible token sampling.", -) -def dataset( - data, - output_path, - processor, - processor_args, - data_args, - short_prompt_strategy, - pad_char, - concat_delimiter, - prompt_tokens, - output_tokens, - push_to_hub, - hub_dataset_id, - random_seed, -): - process_dataset( - data=data, - output_path=output_path, - processor=processor, - prompt_tokens=prompt_tokens, - output_tokens=output_tokens, - processor_args=processor_args, - data_args=data_args, - short_prompt_strategy=short_prompt_strategy, - pad_char=pad_char, - concat_delimiter=concat_delimiter, - push_to_hub=push_to_hub, - hub_dataset_id=hub_dataset_id, - random_seed=random_seed, - ) - - @cli.command( "mock-server", help=( diff --git a/src/guidellm/backends/backend.py b/src/guidellm/backends/backend.py index 89169a48..bc3fe37a 100644 --- a/src/guidellm/backends/backend.py +++ b/src/guidellm/backends/backend.py @@ -102,9 +102,8 @@ def requests_limit(self) -> int | None: return None @abstractmethod - async def default_model(self) -> str | None: + async def default_model(self) -> str: """ :return: The default model name or identifier for generation requests, - None if no default model is available """ ... 
diff --git a/src/guidellm/backends/openai.py b/src/guidellm/backends/openai.py index 1e74fc6e..22b411ae 100644 --- a/src/guidellm/backends/openai.py +++ b/src/guidellm/backends/openai.py @@ -54,7 +54,7 @@ class OpenAIHTTPBackend(Backend): def __init__( self, target: str, - model: str | None = None, + model: str = "", api_routes: dict[str, str] | None = None, response_handlers: dict[str, Any] | None = None, timeout: float = 60.0, @@ -192,7 +192,7 @@ async def available_models(self) -> list[str]: return [item["id"] for item in response.json()["data"]] - async def default_model(self) -> str | None: + async def default_model(self) -> str: """ Get the default model for this backend. @@ -202,9 +202,9 @@ async def default_model(self) -> str | None: return self.model models = await self.available_models() - return models[0] if models else None + return models[0] if models else "" - async def resolve( + async def resolve( # type: ignore[override] self, request: GenerationRequest, request_info: RequestInfo, @@ -230,11 +230,9 @@ async def resolve( if history is not None: raise NotImplementedError("Multi-turn requests not yet supported") - response_handler = self._resolve_response_handler( - request_type=request.request_type - ) if (request_path := self.api_routes.get(request.request_type)) is None: raise ValueError(f"Unsupported request type '{request.request_type}'") + request_url = f"{self.target}/{request_path}" request_files = ( { @@ -246,6 +244,9 @@ async def resolve( ) request_json = request.arguments.body if not request_files else None request_data = request.arguments.body if request_files else None + response_handler = self._resolve_response_handler( + request_type=request.request_type + ) if not request.arguments.stream: request_info.timings.request_start = time.time() @@ -282,24 +283,22 @@ async def resolve( async for chunk in stream.aiter_lines(): iter_time = time.time() - if ( - (iterations := response_handler.add_streaming_line(chunk)) - is None - or iterations < 0 - or end_reached - ): + if request_info.timings.first_request_iteration is None: + request_info.timings.first_request_iteration = iter_time + request_info.timings.last_request_iteration = iter_time + request_info.timings.request_iterations += 1 + + iterations = response_handler.add_streaming_line(chunk) + if iterations is None or iterations <= 0 or end_reached: end_reached = end_reached or iterations is None continue - if ( - request_info.timings.first_iteration is None - or request_info.timings.iterations is None - ): - request_info.timings.first_iteration = iter_time - request_info.timings.iterations = 0 + if request_info.timings.first_token_iteration is None: + request_info.timings.first_token_iteration = iter_time + request_info.timings.token_iterations = 0 - request_info.timings.last_iteration = iter_time - request_info.timings.iterations += iterations + request_info.timings.last_token_iteration = iter_time + request_info.timings.token_iterations += iterations request_info.timings.request_end = time.time() yield response_handler.compile_streaming(request), request_info diff --git a/src/guidellm/backends/response_handlers.py b/src/guidellm/backends/response_handlers.py index b7bd06ad..18aaf320 100644 --- a/src/guidellm/backends/response_handlers.py +++ b/src/guidellm/backends/response_handlers.py @@ -9,7 +9,7 @@ from __future__ import annotations -from typing import Any, Protocol +from typing import Any, Protocol, cast from guidellm.schemas import GenerationRequest, GenerationResponse, UsageMetrics from guidellm.utils 
import RegistryMixin, json @@ -109,14 +109,15 @@ def compile_non_streaming( :return: Standardized GenerationResponse with extracted text and metrics """ choices, usage = self.extract_choices_and_usage(response) - input_metrics, output_metrics = self.extract_metrics(usage) + text = choices[0].get("text", "") if choices else "" + input_metrics, output_metrics = self.extract_metrics(usage, text) return GenerationResponse( request_id=request.request_id, request_args=str( request.arguments.model_dump() if request.arguments else None ), - text=choices[0].get("text", "") if choices else "", + text=text, input_metrics=input_metrics, output_metrics=output_metrics, ) @@ -137,7 +138,7 @@ def add_streaming_line(self, line: str) -> int | None: updated = False choices, usage = self.extract_choices_and_usage(data) - if text := choices[0].get("text"): + if choices and (text := choices[0].get("text")): self.streaming_texts.append(text) updated = True @@ -153,14 +154,15 @@ def compile_streaming(self, request: GenerationRequest) -> GenerationResponse: :param request: Original generation request :return: Standardized GenerationResponse with concatenated text and metrics """ - input_metrics, output_metrics = self.extract_metrics(self.streaming_usage) + text = "".join(self.streaming_texts) + input_metrics, output_metrics = self.extract_metrics(self.streaming_usage, text) return GenerationResponse( request_id=request.request_id, request_args=str( request.arguments.model_dump() if request.arguments else None ), - text="".join(self.streaming_texts), + text=text, input_metrics=input_metrics, output_metrics=output_metrics, ) @@ -194,25 +196,34 @@ def extract_choices_and_usage( return response.get("choices", []), response.get("usage", {}) def extract_metrics( - self, usage: dict[str, int | dict[str, int]] | None + self, usage: dict[str, int | dict[str, int]] | None, text: str ) -> tuple[UsageMetrics, UsageMetrics]: """ Extract input and output usage metrics from API response usage data. 
:param usage: Usage data dictionary from API response + :param text: Generated text for calculating word and character counts :return: Tuple of input_metrics and output_metrics as UsageMetrics objects """ if not usage: - return UsageMetrics(), UsageMetrics() + return UsageMetrics(), UsageMetrics( + text_words=len(text.split()) if text else 0, + text_characters=len(text) if text else 0, + ) - input_details: dict[str, int] = usage.get("prompt_tokens_details", {}) or {} - output_details: dict[str, int] = ( - usage.get("completion_tokens_details", {}) or {} + input_details: dict[str, int] = cast( + "dict[str, int]", usage.get("prompt_tokens_details", {}) or {} + ) + output_details: dict[str, int] = cast( + "dict[str, int]", usage.get("completion_tokens_details", {}) or {} ) + usage_metrics: dict[str, int] = cast("dict[str, int]", usage) return UsageMetrics( text_tokens=( - input_details.get("prompt_tokens") or usage.get("prompt_tokens") + input_details.get("prompt_tokens") + or usage_metrics.get("prompt_tokens") + or 0 ), image_tokens=input_details.get("image_tokens"), video_tokens=input_details.get("video_tokens"), @@ -221,8 +232,11 @@ def extract_metrics( ), UsageMetrics( text_tokens=( output_details.get("completion_tokens") - or usage.get("completion_tokens") + or usage_metrics.get("completion_tokens") + or 0 ), + text_words=len(text.split()) if text else 0, + text_characters=len(text) if text else 0, image_tokens=output_details.get("image_tokens"), video_tokens=output_details.get("video_tokens"), audio_tokens=output_details.get("audio_tokens"), @@ -254,14 +268,16 @@ def compile_non_streaming( :return: Standardized GenerationResponse with extracted content and metrics """ choices, usage = self.extract_choices_and_usage(response) - input_metrics, output_metrics = self.extract_metrics(usage) + choice = choices[0] if choices else {} + text = choice.get("content", "") + input_metrics, output_metrics = self.extract_metrics(usage, text) return GenerationResponse( request_id=request.request_id, request_args=str( request.arguments.model_dump() if request.arguments else None ), - text=(choices[0].get("message", {}).get("content", "") if choices else ""), + text=text, input_metrics=input_metrics, output_metrics=output_metrics, ) @@ -298,14 +314,15 @@ def compile_streaming(self, request: GenerationRequest) -> GenerationResponse: :param request: Original generation request :return: Standardized GenerationResponse with concatenated content and metrics """ - input_metrics, output_metrics = self.extract_metrics(self.streaming_usage) + text = "".join(self.streaming_texts) + input_metrics, output_metrics = self.extract_metrics(self.streaming_usage, text) return GenerationResponse( request_id=request.request_id, request_args=str( request.arguments.model_dump() if request.arguments else None ), - text="".join(self.streaming_texts), + text=text, input_metrics=input_metrics, output_metrics=output_metrics, ) @@ -352,10 +369,9 @@ def compile_non_streaming( :param response: Complete API response containing text and usage data :return: Standardized GenerationResponse with extracted text and metrics """ - usage: dict[str, int | dict[str, int]] = response.get("usage", {}) - input_details: dict[str, int] = usage.get("input_token_details", {}) or {} - output_details: dict[str, int] = usage.get("output_token_details", {}) or {} text: str = response.get("text", "") + usage: dict[str, int | dict[str, int]] = response.get("usage", {}) + input_metrics, output_metrics = self.extract_metrics(usage, text) return 
GenerationResponse( request_id=request.request_id, @@ -363,18 +379,8 @@ def compile_non_streaming( request.arguments.model_dump() if request.arguments else None ), text=text, - input_metrics=UsageMetrics( - text_tokens=input_details.get("text_tokens", usage.get("input_tokens")), - audio_tokens=input_details.get( - "audio_tokens", usage.get("input_tokens") - ), - audio_seconds=input_details.get("seconds", usage.get("seconds")), - ), - output_metrics=UsageMetrics( - text_tokens=output_details.get( - "text_tokens", usage.get("output_tokens") - ), - ), + input_metrics=input_metrics, + output_metrics=output_metrics, ) def add_streaming_line(self, line: str) -> int | None: @@ -394,8 +400,6 @@ def add_streaming_line(self, line: str) -> int | None: return 0 data: dict[str, Any] = json.loads(line) - text: str - usage: dict[str, int | dict[str, int]] updated = False if text := data.get("text"): @@ -414,20 +418,21 @@ def compile_streaming(self, request: GenerationRequest) -> GenerationResponse: :param request: Original generation request :return: Standardized GenerationResponse with concatenated text and metrics """ - input_metrics, output_metrics = self.extract_metrics(self.streaming_usage) + text = "".join(self.streaming_texts) + input_metrics, output_metrics = self.extract_metrics(self.streaming_usage, text) return GenerationResponse( request_id=request.request_id, request_args=str( request.arguments.model_dump() if request.arguments else None ), - text="".join(self.streaming_texts), + text=text, input_metrics=input_metrics, output_metrics=output_metrics, ) def extract_metrics( - self, usage: dict[str, int | dict[str, int]] | None + self, usage: dict[str, int | dict[str, int]] | None, text: str ) -> tuple[UsageMetrics, UsageMetrics]: """ Extract input and output usage metrics from audio API response usage data. @@ -436,20 +441,40 @@ def extract_metrics( in addition to standard text token counts. 
:param usage: Usage data dictionary from audio API response + :param text: Generated text for calculating word and character counts :return: Tuple of input_metrics and output_metrics as UsageMetrics objects """ if not usage: - return UsageMetrics(), UsageMetrics() + return UsageMetrics(), UsageMetrics( + text_words=len(text.split()) if text else 0, + text_characters=len(text) if text else 0, + ) - input_details: dict[str, int] = usage.get("input_token_details", {}) or {} - output_details: dict[str, int] = usage.get("output_token_details", {}) or {} + input_details: dict[str, int] = cast( + "dict[str, int]", usage.get("input_token_details", {}) or {} + ) + output_details: dict[str, int] = cast( + "dict[str, int]", usage.get("output_token_details", {}) or {} + ) + usage_metrics: dict[str, int] = cast("dict[str, int]", usage) return UsageMetrics( - text_tokens=(input_details.get("text_tokens") or usage.get("input_tokens")), + text_tokens=input_details.get("text_tokens") or 0, audio_tokens=( - input_details.get("audio_tokens") or usage.get("audio_tokens") + input_details.get("audio_tokens") + or usage_metrics.get("audio_tokens") + or usage_metrics.get("input_tokens") + or 0 + ), + audio_seconds=( + input_details.get("seconds") or usage_metrics.get("seconds") or 0 ), - audio_seconds=(input_details.get("seconds") or usage.get("seconds")), ), UsageMetrics( - text_tokens=output_details.get("text_tokens") or usage.get("output_tokens"), + text_tokens=( + output_details.get("text_tokens") + or usage_metrics.get("output_tokens") + or 0 + ), + text_words=len(text.split()) if text else 0, + text_characters=len(text) if text else 0, ) diff --git a/src/guidellm/benchmark/__init__.py b/src/guidellm/benchmark/__init__.py index ef7b2900..ed153881 100644 --- a/src/guidellm/benchmark/__init__.py +++ b/src/guidellm/benchmark/__init__.py @@ -12,7 +12,7 @@ from .benchmarker import Benchmarker from .entrypoints import benchmark_generative_text, reimport_benchmarks_report -from .output import ( +from .outputs import ( GenerativeBenchmarkerConsole, GenerativeBenchmarkerCSV, GenerativeBenchmarkerHTML, @@ -31,34 +31,43 @@ from .scenarios import get_builtin_scenarios from .schemas import ( Benchmark, - BenchmarkerArgs, - BenchmarkerDict, + BenchmarkAccumulator, + BenchmarkAccumulatorT, + BenchmarkConfig, BenchmarkGenerativeTextArgs, - BenchmarkSchedulerStats, - EstimatedBenchmarkState, + BenchmarkT, GenerativeAudioMetricsSummary, GenerativeBenchmark, + GenerativeBenchmarkAccumulator, GenerativeBenchmarksReport, + GenerativeBenchmarkTimings, GenerativeImageMetricsSummary, GenerativeMetrics, + GenerativeMetricsAccumulator, GenerativeMetricsSummary, + GenerativeRequestsAccumulator, + GenerativeTextMetricsSummary, GenerativeVideoMetricsSummary, - SchedulerDict, + RunningMetricStats, + SchedulerMetrics, + SchedulerMetricsAccumulator, ) __all__ = [ "AsyncProfile", "Benchmark", + "BenchmarkAccumulator", + "BenchmarkAccumulatorT", + "BenchmarkConfig", "BenchmarkGenerativeTextArgs", - "BenchmarkSchedulerStats", + "BenchmarkT", "Benchmarker", - "BenchmarkerArgs", - "BenchmarkerDict", "BenchmarkerProgress", "ConcurrentProfile", - "EstimatedBenchmarkState", "GenerativeAudioMetricsSummary", "GenerativeBenchmark", + "GenerativeBenchmarkAccumulator", + "GenerativeBenchmarkTimings", "GenerativeBenchmarkerCSV", "GenerativeBenchmarkerConsole", "GenerativeBenchmarkerHTML", @@ -67,11 +76,16 @@ "GenerativeConsoleBenchmarkerProgress", "GenerativeImageMetricsSummary", "GenerativeMetrics", + "GenerativeMetricsAccumulator", 
"GenerativeMetricsSummary", + "GenerativeRequestsAccumulator", + "GenerativeTextMetricsSummary", "GenerativeVideoMetricsSummary", "Profile", "ProfileType", - "SchedulerDict", + "RunningMetricStats", + "SchedulerMetrics", + "SchedulerMetricsAccumulator", "SweepProfile", "SynchronousProfile", "ThroughputProfile", diff --git a/src/guidellm/benchmark/benchmarker.py b/src/guidellm/benchmark/benchmarker.py index 8a46d44e..2195ea59 100644 --- a/src/guidellm/benchmark/benchmarker.py +++ b/src/guidellm/benchmark/benchmarker.py @@ -1,11 +1,11 @@ """ Benchmark execution orchestration and lifecycle management. -Provides the core benchmarking engine that coordinates request scheduling, -data aggregation, and result compilation across different execution strategies -and environments. The Benchmarker acts as the primary workflow coordinator, -managing the complete benchmark lifecycle from request submission through -result compilation while supporting thread-safe singleton operations. +Provides the core benchmarking engine coordinating request scheduling, +data aggregation, and result compilation across execution strategies +and environments. The Benchmarker manages the complete benchmark lifecycle +from request submission through result compilation while supporting +thread-safe singleton operations for consistent state management. """ from __future__ import annotations @@ -13,24 +13,28 @@ import uuid from abc import ABC from collections.abc import AsyncIterator, Iterable -from typing import Any, Generic +from typing import Generic from guidellm.benchmark.profile import Profile from guidellm.benchmark.progress import BenchmarkerProgress from guidellm.benchmark.schemas import ( - BenchmarkerArgs, + BenchmarkAccumulatorT, + BenchmarkConfig, BenchmarkT, - EstimatedBenchmarkState, ) from guidellm.logger import logger from guidellm.scheduler import ( BackendInterface, + Constraint, Environment, + MultiTurnRequestT, RequestT, ResponseT, Scheduler, + SchedulingStrategy, ) from guidellm.utils import ThreadSafeSingletonMixin +from guidellm.utils.mixins import InfoMixin __all__ = ["Benchmarker"] @@ -43,46 +47,45 @@ class Benchmarker( """ Abstract benchmark orchestrator for request processing workflows. - Coordinates execution of benchmarking runs across different scheduling - strategies, aggregating metrics and compiling results. Manages the complete - benchmark lifecycle from request submission through result compilation while - implementing thread-safe singleton pattern to ensure consistent state across - concurrent operations. + Coordinates benchmarking runs across scheduling strategies, aggregating + metrics and compiling results. Manages the complete benchmark lifecycle + from request submission through result compilation while implementing a + thread-safe singleton pattern for consistent state across concurrent + operations. 
""" async def run( self, + accumulator_class: type[BenchmarkAccumulatorT], benchmark_class: type[BenchmarkT], - requests: Iterable[RequestT | Iterable[RequestT | tuple[RequestT, float]]], + requests: Iterable[RequestT | MultiTurnRequestT[RequestT]], backend: BackendInterface[RequestT, ResponseT], profile: Profile, environment: Environment, - data: list[Any], - progress: BenchmarkerProgress[BenchmarkT] | None = None, + progress: ( + BenchmarkerProgress[BenchmarkAccumulatorT, BenchmarkT] | None + ) = None, sample_requests: int | None = 20, warmup: float | None = None, cooldown: float | None = None, prefer_response_metrics: bool = True, ) -> AsyncIterator[BenchmarkT]: """ - Execute benchmark runs across multiple scheduling strategies. - - Orchestrates the complete benchmark workflow by iterating through scheduling - strategies from the profile, executing requests through the scheduler, - aggregating metrics, and compiling final benchmark results. - - :param benchmark_class: Class for constructing final benchmark objects - :param requests: Request datasets for processing across strategies - :param backend: Backend interface for request processing - :param profile: Benchmark profile defining strategies and constraints - :param environment: Execution environment for coordination - :param progress: Optional progress tracker for benchmark lifecycle events - :param sample_requests: Number of sample requests to use for estimation - :param warmup: Optional warmup duration in seconds before benchmarking - :param cooldown: Optional cooldown duration in seconds after benchmarking - :param prefer_response_metrics: Whether to prefer response-based metrics over - request-based metrics - :yield: Compiled benchmark results for each strategy execution + Execute benchmark runs across scheduling strategies defined in the profile. 
+ + :param accumulator_class: Class for accumulating metrics during execution + :param benchmark_class: Class for constructing final benchmark results + :param requests: Request datasets to process across strategies + :param backend: Backend interface for executing requests + :param profile: Profile defining scheduling strategies and constraints + :param environment: Environment for execution coordination + :param progress: Optional tracker for benchmark lifecycle events + :param sample_requests: Number of requests to sample for estimation + :param warmup: Warmup duration in seconds before benchmarking + :param cooldown: Cooldown duration in seconds after benchmarking + :param prefer_response_metrics: Whether to prefer response metrics over + request metrics + :yield: Compiled benchmark result for each strategy execution :raises Exception: If benchmark execution or compilation fails """ with self.thread_lock: @@ -91,21 +94,38 @@ async def run( run_id = str(uuid.uuid4()) strategies_generator = profile.strategies_generator() + strategy: SchedulingStrategy | None + constraints: dict[str, Constraint] | None strategy, constraints = next(strategies_generator) while strategy is not None: if progress: await progress.on_benchmark_start(strategy) - args = BenchmarkerArgs( + config = BenchmarkConfig( run_id=run_id, run_index=len(profile.completed_strategies), + strategy=strategy, + constraints=( + { + key: InfoMixin.extract_from_obj(val) + for key, val in constraints.items() + } + if isinstance(constraints, dict) + else {"constraint": InfoMixin.extract_from_obj(constraints)} + if constraints + else {} + ), sample_requests=sample_requests, warmup=warmup, cooldown=cooldown, prefer_response_metrics=prefer_response_metrics, + profile=profile, + requests=InfoMixin.extract_from_obj(requests), + backend=InfoMixin.extract_from_obj(backend), + environment=InfoMixin.extract_from_obj(environment), ) - estimated_state = EstimatedBenchmarkState() + accumulator = accumulator_class(config=config) scheduler_state = None scheduler: Scheduler[RequestT, ResponseT] = Scheduler() @@ -123,9 +143,7 @@ async def run( **constraints or {}, ): try: - benchmark_class.update_estimate( - args, - estimated_state, + accumulator.update_estimate( response, request, request_info, @@ -133,7 +151,7 @@ async def run( ) if progress: await progress.on_benchmark_update( - estimated_state, scheduler_state + accumulator, scheduler_state ) except Exception as err: # noqa: BLE001 logger.error( @@ -141,17 +159,10 @@ async def run( ) benchmark = benchmark_class.compile( - args=args, - estimated_state=estimated_state, + accumulator=accumulator, scheduler_state=scheduler_state, - profile=profile, - requests=requests, - backend=backend, - environment=environment, - strategy=strategy, - constraints=constraints, - data=data, ) + if progress: await progress.on_benchmark_complete(benchmark) diff --git a/src/guidellm/benchmark/entrypoints.py b/src/guidellm/benchmark/entrypoints.py index e095ed12..c04c89a8 100644 --- a/src/guidellm/benchmark/entrypoints.py +++ b/src/guidellm/benchmark/entrypoints.py @@ -1,18 +1,17 @@ """ -High-level entry points for executing generative text benchmarks. - -This module provides the primary interface for running generative text benchmarks -through the `benchmark_generative_text` function and re-importing existing benchmark -reports via `reimport_benchmarks_report`. It orchestrates the initialization and -coordination of backends, data loaders, profiles, and output formats to execute -comprehensive benchmarking workflows. 
The module handles all resolution logic for -converting user-provided arguments into fully configured components ready for -benchmarking execution. +Primary interface for executing and re-importing generative text benchmarks. + +This module orchestrates comprehensive benchmarking workflows by coordinating backend +initialization, data loading, profile configuration, and output generation. It provides +two main entry points: `benchmark_generative_text` for executing new benchmarks and +`reimport_benchmarks_report` for re-exporting existing results. The resolution functions +convert user-provided arguments into fully configured components, handling backend +validation, data preprocessing, profile constraints, and output format specifications. """ from __future__ import annotations -from collections.abc import Callable +from collections.abc import Callable, Mapping, MutableMapping from pathlib import Path from typing import Any, Literal @@ -22,12 +21,13 @@ from guidellm.backends import Backend, BackendType from guidellm.benchmark.benchmarker import Benchmarker -from guidellm.benchmark.output import GenerativeBenchmarkerOutput +from guidellm.benchmark.outputs import GenerativeBenchmarkerOutput from guidellm.benchmark.profile import Profile, ProfileType from guidellm.benchmark.progress import GenerativeConsoleBenchmarkerProgress from guidellm.benchmark.schemas import ( BenchmarkGenerativeTextArgs, GenerativeBenchmark, + GenerativeBenchmarkAccumulator, GenerativeBenchmarksReport, ) from guidellm.data import ( @@ -36,6 +36,7 @@ GenerativeRequestCollator, PreprocessorRegistry, ProcessorFactory, + RequestFormatter, ) from guidellm.data.preprocessors import GenerativeColumnMapper from guidellm.scheduler import ( @@ -44,6 +45,7 @@ StrategyType, ) from guidellm.schemas import GenerationRequest, GenerationResponse +from guidellm.settings import settings from guidellm.utils import Console, InfoMixin __all__ = [ @@ -52,17 +54,22 @@ ] -# Helper Functions +# Type Aliases OutputFormatT = TypeAliasType( "OutputFormatT", tuple[str, ...] | list[str] - | dict[str, str | dict[str, Any] | GenerativeBenchmarkerOutput] + | Mapping[str, str | dict[str, Any] | GenerativeBenchmarkerOutput] | None, ) +"""Output format specification as strings, mappings, or configured output instances""" ProcessorInputT = TypeAliasType("ProcessorInputT", str | Path | PreTrainedTokenizerBase) +"""Processor input as model identifier, path to tokenizer, or tokenizer instance""" + + +# Helper Functions async def resolve_backend( @@ -71,9 +78,14 @@ async def resolve_backend( model: str | None, console: Console | None = None, **backend_kwargs: dict[str, Any], -) -> tuple[Backend, str | None]: +) -> tuple[Backend, str]: """ - Initialize and validate a backend instance for benchmarking. + Initialize and validate a backend instance for benchmarking execution. + + Handles backend creation from type identifiers or pre-configured instances, + performs startup validation, and resolves the default model if not specified. + The backend is shut down after validation to ensure clean state for subsequent + benchmark execution. 
:param backend: Backend type identifier or pre-configured Backend instance :param target: Target endpoint URL or connection string for the backend @@ -87,17 +99,19 @@ async def resolve_backend( if console else None ) - backend = ( + backend_instance = ( Backend.create(backend, target=target, model=model, **(backend_kwargs or {})) if not isinstance(backend, Backend) else backend ) if console_step: - console_step.update(f"{backend.__class__.__name__} backend initialized") + console_step.update( + f"{backend_instance.__class__.__name__} backend initialized" + ) - await backend.process_startup() - await backend.validate() + await backend_instance.process_startup() + await backend_instance.validate() if model is None: if console_step: @@ -105,20 +119,21 @@ async def resolve_backend( title="Resolving default model from backend.default_model", status_level="info", ) - model = await backend.default_model() + model = await backend_instance.default_model() - await backend.process_shutdown() + await backend_instance.process_shutdown() if console_step: console_step.finish( title=( - f"{backend.__class__.__name__} backend validated with model {model}" + f"{backend_instance.__class__.__name__} backend validated " + f"with model {model}" ), - details=backend.info, + details=backend_instance.info, status_level="success", ) - return backend, model + return backend_instance, model async def resolve_processor( @@ -127,7 +142,7 @@ async def resolve_processor( console: Console | None = None, ) -> ProcessorInputT | None: """ - Resolve the processor for tokenization, defaulting to model if not provided. + Resolve the tokenization processor, defaulting to model if not provided. :param processor: Processor identifier, path, tokenizer instance, or None :param model: Model identifier to use as fallback processor @@ -161,15 +176,17 @@ async def resolve_processor( async def resolve_request_loader( data: list[Any], - model: str | None, + model: str, data_args: list[dict[str, Any]] | None, data_samples: int, processor: ProcessorInputT | None, processor_args: dict[str, Any] | None, data_column_mapper: ( - DatasetPreprocessor | dict[str, str] | Literal["generative_column_mapper"] + DatasetPreprocessor + | dict[str, str | list[str]] + | Literal["generative_column_mapper"] ), - data_request_formatter: (DatasetPreprocessor | dict[str, str] | str), + data_request_formatter: (RequestFormatter | dict[str, str] | str), data_collator: Callable | Literal["generative"] | None, data_sampler: Sampler[int] | Literal["shuffle"] | None, data_num_workers: int | None, @@ -180,6 +197,11 @@ async def resolve_request_loader( """ Construct a DataLoader for GenerationRequest objects from raw data inputs. + Initializes and configures the data pipeline including column mapping, request + formatting, collation, and sampling. Resolves string-based preprocessor identifiers + from the PreprocessorRegistry and creates appropriate instances with provided + configurations. 
+ :param data: List of data sources to load requests from :param model: Model identifier for request formatting :param data_args: Arguments for each data source in the data list @@ -195,6 +217,10 @@ async def resolve_request_loader( :param console: Console instance for progress reporting, or None :param dataloader_kwargs: Additional arguments passed to DataLoader initialization :return: Configured DataLoader instance for GenerationRequest objects + :raises ValueError: If request formatter type is not registered in + PreprocessorRegistry + :raises TypeError: If registered request formatter is not a RequestFormatter + subclass """ console_step = ( console.print_update_step(title=f"Initializing request loader from {data}") @@ -202,38 +228,63 @@ async def resolve_request_loader( else None ) - if not isinstance(data_column_mapper, DatasetPreprocessor): + data_column_mapper_instance: DatasetPreprocessor + if isinstance(data_column_mapper, DatasetPreprocessor): + data_column_mapper_instance = data_column_mapper + else: column_mappings = ( data_column_mapper if isinstance(data_column_mapper, dict) else None ) - data_column_mapper = GenerativeColumnMapper( - column_mappings=column_mappings, + data_column_mapper_instance = GenerativeColumnMapper( + column_mappings=column_mappings # type: ignore[arg-type] ) - if not isinstance(data_request_formatter, DatasetPreprocessor): - request_type = ( - data_request_formatter - if isinstance(data_request_formatter, str) - else data_request_formatter.pop("request_type", "chat_completions") - ) - data_request_formatter = PreprocessorRegistry.get_registered_object( - request_type - )( + + data_request_formatter_instance: RequestFormatter + if isinstance(data_request_formatter, RequestFormatter): + data_request_formatter_instance = data_request_formatter + else: + if isinstance(data_request_formatter, str): + request_type = data_request_formatter + formatter_kwargs: dict[str, Any] = {} + else: + # Extract request_type from formatter dictionary + formatter_dict = dict(data_request_formatter) + request_type = formatter_dict.pop("request_type", settings.preferred_route) + formatter_kwargs = formatter_dict + + if ( + formatter_class := PreprocessorRegistry.get_registered_object(request_type) + ) is None: + raise ValueError( + f"Request formatter '{request_type}' is not registered in the " + f"PreprocessorRegistry." + ) + if not issubclass(formatter_class, RequestFormatter): + raise TypeError( + f"Request formatter '{request_type}' is not a subclass of " + f"RequestFormatter." 
+ ) + + data_request_formatter_instance = formatter_class( model=model, - **( - data_request_formatter - if isinstance(data_request_formatter, dict) - else {} - ), + **formatter_kwargs, ) - request_loader = DataLoader( + # Cast to proper types for the DataLoader preprocessors list + preprocessors_list: list[DatasetPreprocessor] = [ + data_column_mapper_instance, + data_request_formatter_instance, + ] + + request_loader: DataLoader[GenerationRequest] = DataLoader( data=data, data_args=data_args, data_samples=data_samples, processor_factory=ProcessorFactory( - processor=processor, processor_args=processor_args + processor=processor if processor is not None else model, + processor_args=processor_args, ), - preprocessors=[data_column_mapper, data_request_formatter], + preprocessors=preprocessors_list, collator=( data_collator if callable(data_collator) else GenerativeRequestCollator() ), @@ -259,9 +310,9 @@ async def resolve_request_loader( async def resolve_profile( profile: StrategyType | ProfileType | Profile, - rate: float | list[float] | None, + rate: list[float] | None, random_seed: int, - constraints: dict[str, ConstraintInitializer | Any], + constraints: MutableMapping[str, ConstraintInitializer | Any], max_seconds: int | float | None, max_requests: int | None, max_errors: int | None, @@ -272,6 +323,10 @@ async def resolve_profile( """ Resolve and configure a benchmark profile with rate and constraint settings. + Constructs a Profile instance from type identifiers or validates pre-configured + profiles. Constraint parameters are merged into the constraints dictionary before + profile creation. + :param profile: Profile type identifier or pre-configured Profile instance :param rate: Request rate(s) for the benchmark execution :param random_seed: Seed for reproducible random operations @@ -361,20 +416,22 @@ async def benchmark_generative_text( args: BenchmarkGenerativeTextArgs, progress: GenerativeConsoleBenchmarkerProgress | None = None, console: Console | None = None, - **constraints: dict[str, ConstraintInitializer | Any], + **constraints: str | ConstraintInitializer | Any, ) -> tuple[GenerativeBenchmarksReport, dict[str, Any]]: """ Execute a comprehensive generative text benchmarking workflow. - Orchestrates the full benchmarking pipeline by resolving all components (backend, - data loader, profile, outputs) from provided arguments, executing the benchmark - runs, and finalizing results in the specified output formats. + Orchestrates the full benchmarking pipeline by resolving all components from + provided arguments, executing benchmark runs across configured profiles, and + finalizing results in specified output formats. Components include backend + initialization, data loading, profile configuration, and output generation. 
:param args: Configuration arguments for the benchmark execution :param progress: Progress tracker for benchmark execution, or None for no tracking :param console: Console instance for status reporting, or None for silent operation :param constraints: Additional constraint initializers for benchmark limits - :return: Tuple of GenerativeBenchmarksReport and dictionary of output format results + :return: Tuple of GenerativeBenchmarksReport and dictionary of output format + results """ backend, model = await resolve_backend( backend=args.backend, @@ -431,12 +488,12 @@ async def benchmark_generative_text( GenerativeBenchmark, GenerationRequest, GenerationResponse ] = Benchmarker() async for benchmark in benchmarker.run( - benchmark_class=args.benchmark_cls, + accumulator_class=GenerativeBenchmarkAccumulator, + benchmark_class=GenerativeBenchmark, requests=request_loader, backend=backend, profile=profile, environment=NonDistributedEnvironment(), - data=args.data, progress=progress, sample_requests=args.sample_requests, warmup=args.warmup, @@ -472,12 +529,13 @@ async def reimport_benchmarks_report( output_formats: OutputFormatT = ("console", "json", "html", "csv"), ) -> tuple[GenerativeBenchmarksReport, dict[str, Any]]: """ - Load and re-export an existing benchmarks report in specified formats. + Load and re-export an existing benchmarks report in specified output formats. :param file: Path to the existing benchmark report file to load :param output_path: Base path for output file generation, or None for default :param output_formats: Specification of desired output formats for the report - :return: Tuple of loaded GenerativeBenchmarksReport and dictionary of output results + :return: Tuple of loaded GenerativeBenchmarksReport and dictionary of output + results """ console = Console() @@ -490,11 +548,11 @@ async def reimport_benchmarks_report( f" loaded {len(report.benchmarks)} benchmark(s)" ) - output_formats = await resolve_output_formats( + resolved_output_formats = await resolve_output_formats( output_formats, output_path, console=console ) output_format_results = {} - for key, output in output_formats.items(): + for key, output in resolved_output_formats.items(): output_result = await output.finalize(report) output_format_results[key] = output_result diff --git a/src/guidellm/benchmark/output.py b/src/guidellm/benchmark/output.py deleted file mode 100644 index 6e17de5b..00000000 --- a/src/guidellm/benchmark/output.py +++ /dev/null @@ -1,745 +0,0 @@ -from __future__ import annotations - -import csv -import json -import math -from abc import ABC, abstractmethod -from collections import OrderedDict -from copy import deepcopy -from datetime import datetime -from pathlib import Path -from typing import Any, ClassVar - -from pydantic import BaseModel, ConfigDict, Field -from rich.console import Console -from rich.padding import Padding -from rich.text import Text - -from guidellm.benchmark.profile import ( - AsyncProfile, - ConcurrentProfile, - SweepProfile, - ThroughputProfile, -) -from guidellm.benchmark.schemas import ( - GenerativeBenchmark, - GenerativeBenchmarksReport, - GenerativeMetrics, -) -from guidellm.presentation import UIDataBuilder -from guidellm.presentation.injector import create_report -from guidellm.settings import settings -from guidellm.utils import ( - Colors, - DistributionSummary, - RegistryMixin, - StatusDistributionSummary, - camelize_str, - recursive_key_update, - safe_format_timestamp, - split_text_list_by_length, -) - -__all__ = [ - "GenerativeBenchmarkerCSV", - 
"GenerativeBenchmarkerConsole", - "GenerativeBenchmarkerHTML", - "GenerativeBenchmarkerOutput", -] - - -class GenerativeBenchmarkerOutput( - BaseModel, RegistryMixin[type["GenerativeBenchmarkerOutput"]], ABC -): - model_config = ConfigDict( - extra="ignore", - arbitrary_types_allowed=True, - validate_assignment=True, - from_attributes=True, - use_enum_values=True, - ) - - @classmethod - @abstractmethod - def validated_kwargs(cls, *args, **kwargs) -> dict[str, Any]: - """ - Validate and process arguments for constraint creation. - - Must be implemented by subclasses to handle their specific parameter patterns. - - :param args: Positional arguments passed to the constraint - :param kwargs: Keyword arguments passed to the constraint - :return: Validated dictionary of parameters for constraint creation - :raises NotImplementedError: Must be implemented by subclasses - """ - ... - - @classmethod - def resolve( - cls, - output_formats: ( - tuple[str, ...] - | list[str] - | dict[ - str, - Any | dict[str, Any] | GenerativeBenchmarkerOutput, - ] - | None - ), - output_path: str | Path | None, - ) -> dict[str, GenerativeBenchmarkerOutput]: - if not output_formats: - return {} - - if isinstance(output_formats, list | tuple): - # support list of output keys: ["csv", "json"] - # support list of files: ["path/to/file.json", "path/to/file.csv"] - formats_list = output_formats - output_formats = {} - for output_format in formats_list: - if not isinstance(output_format, str): - raise TypeError( - f"Expected string format, got {type(output_format)} for " - f"{output_format} in {formats_list}" - ) - try: - if cls.is_registered(output_format): - output_formats[output_format] = {} - else: - # treat it as a file save location - path = Path(output_format) - format_type = path.suffix[1:].lower() - output_formats[format_type] = {"output_path": path} - - except Exception as err: - raise ValueError( - f"Failed to resolve output format '{output_format}': {err}" - ) from err - - resolved = {} - - for key, val in output_formats.items(): - if isinstance(val, GenerativeBenchmarkerOutput): - resolved[key] = val - else: - output_class = cls.get_registered_object(key) - kwargs = {"output_path": output_path} - - if isinstance(val, dict): - kwargs.update(val) - kwargs = output_class.validated_kwargs(**kwargs) - else: - kwargs = output_class.validated_kwargs(val, **kwargs) - - resolved[key] = output_class(**kwargs) - - return resolved - - @abstractmethod - async def finalize(self, report: GenerativeBenchmarksReport) -> Any: ... 
- - -@GenerativeBenchmarkerOutput.register(["json", "yaml"]) -class GenerativeBenchmarkerSerialized(GenerativeBenchmarkerOutput): - @classmethod - def validated_kwargs( - cls, output_path: str | Path | None, **_kwargs - ) -> dict[str, Any]: - new_kwargs = {} - if output_path is not None: - new_kwargs["output_path"] = ( - Path(output_path) if not isinstance(output_path, Path) else output_path - ) - return new_kwargs - - output_path: Path = Field(default_factory=lambda: Path.cwd()) - - async def finalize(self, report: GenerativeBenchmarksReport) -> Path: - return report.save_file(self.output_path) - - -@GenerativeBenchmarkerOutput.register("console") -class GenerativeBenchmarkerConsole(GenerativeBenchmarkerOutput): - """Console output formatter for benchmark results with rich formatting.""" - - @classmethod - def validated_kwargs(cls, *_args, **_kwargs) -> dict[str, Any]: - return {} - - console: Console = Field(default_factory=Console) - - async def finalize(self, report: GenerativeBenchmarksReport) -> str: - """ - Print the complete benchmark report to the console. - - :param report: The completed benchmark report. - :return: - """ - self._print_benchmarks_metadata(report.benchmarks) - self._print_benchmarks_info(report.benchmarks) - self._print_benchmarks_stats(report.benchmarks) - - return "printed to console" - - def _print_benchmarks_metadata(self, benchmarks: list[GenerativeBenchmark]): - start_time = benchmarks[0].run_stats.start_time - end_time = benchmarks[-1].run_stats.end_time - duration = end_time - start_time - - self._print_section_header("Benchmarks Metadata") - self._print_labeled_line("Run id", str(benchmarks[0].run_id)) - self._print_labeled_line("Duration", f"{duration:.1f} seconds") - self._print_labeled_line("Profile", self._get_profile_str(benchmarks[0])) - - def _print_benchmarks_info(self, benchmarks: list[GenerativeBenchmark]): - sections = { - "Metadata": (0, 3), - "Requests Made": (4, 6), - "Prompt Tok/Req": (7, 9), - "Output Tok/Req": (10, 12), - "Prompt Tok Total": (13, 15), - "Output Tok Total": (16, 18), - } - headers = [ - "Benchmark", - "Start Time", - "End Time", - "Duration (s)", - "Comp", - "Inc", - "Err", - "Comp", - "Inc", - "Err", - "Comp", - "Inc", - "Err", - "Comp", - "Inc", - "Err", - "Comp", - "Inc", - "Err", - ] - - rows = [] - for benchmark in benchmarks: - rows.append( - [ - str(benchmark.scheduler.strategy), - safe_format_timestamp(benchmark.start_time), - safe_format_timestamp(benchmark.end_time), - f"{(benchmark.end_time - benchmark.start_time):.1f}", - f"{benchmark.request_totals.successful:.0f}", - f"{benchmark.request_totals.incomplete:.0f}", - f"{benchmark.request_totals.errored:.0f}", - f"{benchmark.metrics.prompt_token_count.successful.mean:.1f}", - f"{benchmark.metrics.prompt_token_count.incomplete.mean:.1f}", - f"{benchmark.metrics.prompt_token_count.errored.mean:.1f}", - f"{benchmark.metrics.output_token_count.successful.mean:.1f}", - f"{benchmark.metrics.output_token_count.incomplete.mean:.1f}", - f"{benchmark.metrics.output_token_count.errored.mean:.1f}", - f"{benchmark.metrics.prompt_token_count.successful.total_sum:.0f}", - f"{benchmark.metrics.prompt_token_count.incomplete.total_sum:.0f}", - f"{benchmark.metrics.prompt_token_count.errored.total_sum:.0f}", - f"{benchmark.metrics.output_token_count.successful.total_sum:.0f}", - f"{benchmark.metrics.output_token_count.incomplete.total_sum:.0f}", - f"{benchmark.metrics.output_token_count.errored.total_sum:.0f}", - ] - ) - - self._print_table(headers, rows, "Benchmarks Info", 
sections) - - def _print_benchmarks_stats(self, benchmarks: list[GenerativeBenchmark]): - sections = { - "Metadata": (0, 0), - "Request Stats": (1, 2), - "Out Tok/sec": (3, 3), - "Tot Tok/sec": (4, 4), - "Req Latency (sec)": (5, 7), - "TTFT (ms)": (8, 10), - "ITL (ms)": (11, 13), - "TPOT (ms)": (14, 16), - } - headers = [ - "Benchmark", - "Per Second", - "Concurrency", - "mean", - "mean", - "mean", - "median", - "p99", - "mean", - "median", - "p99", - "mean", - "median", - "p99", - "mean", - "median", - "p99", - ] - - rows = [] - for benchmark in benchmarks: - rows.append( - [ - str(benchmark.scheduler.strategy), - f"{benchmark.metrics.requests_per_second.successful.mean:.2f}", - f"{benchmark.metrics.request_concurrency.successful.mean:.2f}", - f"{benchmark.metrics.output_tokens_per_second.successful.mean:.1f}", - f"{benchmark.metrics.tokens_per_second.successful.mean:.1f}", - f"{benchmark.metrics.request_latency.successful.mean:.2f}", - f"{benchmark.metrics.request_latency.successful.median:.2f}", - f"{benchmark.metrics.request_latency.successful.percentiles.p99:.2f}", - f"{benchmark.metrics.time_to_first_token_ms.successful.mean:.1f}", - f"{benchmark.metrics.time_to_first_token_ms.successful.median:.1f}", - f"{benchmark.metrics.time_to_first_token_ms.successful.percentiles.p99:.1f}", - f"{benchmark.metrics.inter_token_latency_ms.successful.mean:.1f}", - f"{benchmark.metrics.inter_token_latency_ms.successful.median:.1f}", - f"{benchmark.metrics.inter_token_latency_ms.successful.percentiles.p99:.1f}", - f"{benchmark.metrics.time_per_output_token_ms.successful.mean:.1f}", - f"{benchmark.metrics.time_per_output_token_ms.successful.median:.1f}", - f"{benchmark.metrics.time_per_output_token_ms.successful.percentiles.p99:.1f}", - ] - ) - - self._print_table(headers, rows, "Benchmarks Stats", sections) - - def _get_profile_str(self, benchmark: GenerativeBenchmark) -> str: - profile = benchmark.benchmarker.profile - if profile is None: - return "None" - - profile_args = OrderedDict( - { - "type": profile.type_, - "strategies": getattr(profile, "strategy_types", []), - } - ) - - if isinstance(profile, ConcurrentProfile): - profile_args["streams"] = str(profile.streams) - elif isinstance(profile, ThroughputProfile): - profile_args["max_concurrency"] = str(profile.max_concurrency) - elif isinstance(profile, AsyncProfile): - profile_args["max_concurrency"] = str(profile.max_concurrency) - profile_args["rate"] = str(profile.rate) - elif isinstance(profile, SweepProfile): - profile_args["sweep_size"] = str(profile.sweep_size) - - return ", ".join(f"{key}={value}" for key, value in profile_args.items()) - - def _print_section_header(self, title: str, indent: int = 0, new_lines: int = 2): - self._print_line( - f"{title}:", - f"bold underline {Colors.info}", - indent=indent, - new_lines=new_lines, - ) - - def _print_labeled_line( - self, label: str, value: str, indent: int = 4, new_lines: int = 0 - ): - self._print_line( - [label + ":", value], - ["bold " + Colors.info, "italic"], - new_lines=new_lines, - indent=indent, - ) - - def _print_line( - self, - value: str | list[str], - style: str | list[str] = "", - indent: int = 0, - new_lines: int = 0, - ): - text = Text() - for _ in range(new_lines): - text.append("\n") - - if not isinstance(value, list): - value = [value] - if not isinstance(style, list): - style = [style for _ in range(len(value))] - - if len(value) != len(style): - raise ValueError( - f"Value and style length mismatch: {len(value)} vs {len(style)}" - ) - - for val, sty in zip(value, 
style, strict=False): - text.append(val, style=sty) - - self.console.print(Padding.indent(text, indent)) - - def _print_table( - self, - headers: list[str], - rows: list[list[Any]], - title: str, - sections: dict[str, tuple[int, int]] | None = None, - max_char_per_col: int = 1024, - indent: int = 0, - new_lines: int = 2, - ): - if rows and any(len(row) != len(headers) for row in rows): - raise ValueError( - "Headers and rows length mismatch: " - f"{len(headers)} vs {len(rows[0]) if rows else 'N/A'}" - ) - - max_chars_per_column = self._calculate_max_chars_per_column( - headers, rows, sections, max_char_per_col - ) - - self._print_section_header(title, indent=indent, new_lines=new_lines) - self._print_table_divider(max_chars_per_column, False, indent) - if sections: - self._print_table_sections(sections, max_chars_per_column, indent) - self._print_table_row( - split_text_list_by_length(headers, max_chars_per_column), - f"bold {Colors.info}", - indent, - ) - self._print_table_divider(max_chars_per_column, True, indent) - for row in rows: - self._print_table_row( - split_text_list_by_length(row, max_chars_per_column), - "italic", - indent, - ) - self._print_table_divider(max_chars_per_column, False, indent) - - def _calculate_max_chars_per_column( - self, - headers: list[str], - rows: list[list[Any]], - sections: dict[str, tuple[int, int]] | None, - max_char_per_col: int, - ) -> list[int]: - """Calculate maximum characters per column for table formatting.""" - max_chars_per_column = [] - for ind in range(len(headers)): - max_chars_per_column.append(min(len(headers[ind]), max_char_per_col)) - for row in rows: - max_chars_per_column[ind] = max( - max_chars_per_column[ind], len(str(row[ind])) - ) - - if not sections: - return max_chars_per_column - - for section, (start_col, end_col) in sections.items(): - min_section_len = len(section) + (end_col - start_col) - chars_in_columns = sum( - max_chars_per_column[start_col : end_col + 1] - ) + 2 * (end_col - start_col) - if min_section_len > chars_in_columns: - add_chars_per_col = math.ceil( - (min_section_len - chars_in_columns) / (end_col - start_col + 1) - ) - for col in range(start_col, end_col + 1): - max_chars_per_column[col] += add_chars_per_col - - return max_chars_per_column - - def _print_table_divider( - self, max_chars_per_column: list[int], include_separators: bool, indent: int = 0 - ): - """Print table divider line.""" - if include_separators: - columns = [ - settings.table_headers_border_char * max_chars - + settings.table_column_separator_char - + settings.table_headers_border_char - for max_chars in max_chars_per_column - ] - else: - columns = [ - settings.table_border_char * (max_chars + 2) - for max_chars in max_chars_per_column - ] - columns[-1] = columns[-1][:-2] - self._print_line(columns, Colors.info, indent) - - def _print_table_sections( - self, - sections: dict[str, tuple[int, int]], - max_chars_per_column: list[int], - indent: int = 0, - ): - section_tuples = [(start, end, name) for name, (start, end) in sections.items()] - section_tuples.sort(key=lambda x: x[0]) - - if any(start > end for start, end, _ in section_tuples): - raise ValueError(f"Invalid section ranges: {section_tuples}") - - if ( - any( - section_tuples[ind][1] + 1 != section_tuples[ind + 1][0] - for ind in range(len(section_tuples) - 1) - ) - or section_tuples[0][0] != 0 - or section_tuples[-1][1] != len(max_chars_per_column) - 1 - ): - raise ValueError(f"Invalid section ranges: {section_tuples}") - - line_values = [] - line_styles = [] - for section, 
(start_col, end_col) in sections.items(): - section_length = sum(max_chars_per_column[start_col : end_col + 1]) + 2 * ( - end_col - start_col + 1 - ) - num_separators = end_col - start_col - line_values.extend( - [ - section, - " " * (section_length - len(section) - num_separators - 2), - settings.table_column_separator_char * num_separators, - settings.table_column_separator_char + " ", - ] - ) - line_styles.extend(["bold " + Colors.info, "", "", Colors.info]) - - line_values = line_values[:-1] - line_styles = line_styles[:-1] - self._print_line(line_values, line_styles, indent) - - def _print_table_row( - self, column_lines: list[list[str]], style: str, indent: int = 0 - ): - for row in range(len(column_lines[0])): - print_line = [] - print_styles = [] - for column in range(len(column_lines)): - print_line.extend( - [ - column_lines[column][row], - settings.table_column_separator_char, - " ", - ] - ) - print_styles.extend([style, Colors.info, ""]) - print_line = print_line[:-2] - print_styles = print_styles[:-2] - self._print_line(print_line, print_styles, indent) - - -@GenerativeBenchmarkerOutput.register("csv") -class GenerativeBenchmarkerCSV(GenerativeBenchmarkerOutput): - """CSV output formatter for benchmark results.""" - - DEFAULT_FILE: ClassVar[str] = "benchmarks.csv" - - @classmethod - def validated_kwargs( - cls, output_path: str | Path | None, **_kwargs - ) -> dict[str, Any]: - new_kwargs = {} - if output_path is not None: - new_kwargs["output_path"] = ( - Path(output_path) if not isinstance(output_path, Path) else output_path - ) - return new_kwargs - - output_path: Path = Field(default_factory=lambda: Path.cwd()) - - async def finalize(self, report: GenerativeBenchmarksReport) -> Path: - """ - Save the benchmark report as a CSV file. - - :param report: The completed benchmark report. - :return: Path to the saved CSV file. 
- """ - output_path = self.output_path - if output_path.is_dir(): - output_path = output_path / GenerativeBenchmarkerCSV.DEFAULT_FILE - output_path.parent.mkdir(parents=True, exist_ok=True) - - with output_path.open("w", newline="") as file: - writer = csv.writer(file) - headers: list[str] = [] - rows: list[list[str | float | list[float]]] = [] - - for benchmark in report.benchmarks: - benchmark_headers: list[str] = [] - benchmark_values: list[str | float | list[float]] = [] - - # Add basic run description info - desc_headers, desc_values = self._get_benchmark_desc_headers_and_values( - benchmark - ) - benchmark_headers.extend(desc_headers) - benchmark_values.extend(desc_values) - - # Add status-based metrics - for status in StatusDistributionSummary.model_fields: - status_headers, status_values = ( - self._get_benchmark_status_headers_and_values(benchmark, status) - ) - benchmark_headers.extend(status_headers) - benchmark_values.extend(status_values) - - # Add extra fields - extras_headers, extras_values = ( - self._get_benchmark_extras_headers_and_values(benchmark) - ) - benchmark_headers.extend(extras_headers) - benchmark_values.extend(extras_values) - - if not headers: - headers = benchmark_headers - rows.append(benchmark_values) - - writer.writerow(headers) - for row in rows: - writer.writerow(row) - - return output_path - - def _get_benchmark_desc_headers_and_values( - self, benchmark: GenerativeBenchmark - ) -> tuple[list[str], list[str | float]]: - """Get description headers and values for a benchmark.""" - headers = [ - "Type", - "Run Id", - "Id", - "Name", - "Start Time", - "End Time", - "Duration", - ] - values: list[str | float] = [ - benchmark.type_, - benchmark.run_id, - benchmark.id_, - str(benchmark.scheduler.strategy), - datetime.fromtimestamp(benchmark.start_time).strftime("%Y-%m-%d %H:%M:%S"), - datetime.fromtimestamp(benchmark.end_time).strftime("%Y-%m-%d %H:%M:%S"), - benchmark.duration, - ] - return headers, values - - def _get_benchmark_status_headers_and_values( - self, benchmark: GenerativeBenchmark, status: str - ) -> tuple[list[str], list[float | list[float]]]: - """Get status-based metrics headers and values for a benchmark.""" - headers = [f"{status.capitalize()} Requests"] - values = [getattr(benchmark.request_totals, status)] - - for metric in GenerativeMetrics.model_fields: - metric_headers, metric_values = self._get_benchmark_status_metrics_stats( - benchmark, status, metric - ) - headers.extend(metric_headers) - values.extend(metric_values) - - return headers, values - - def _get_benchmark_status_metrics_stats( - self, benchmark: GenerativeBenchmark, status: str, metric: str - ) -> tuple[list[str], list[float | list[float]]]: - """Get statistical metrics for a specific status and metric.""" - status_display = status.capitalize() - metric_display = metric.replace("_", " ").capitalize() - status_dist_summary: StatusDistributionSummary = getattr( - benchmark.metrics, metric - ) - if not hasattr(status_dist_summary, status): - return [], [] - dist_summary: DistributionSummary = getattr(status_dist_summary, status) - - headers = [ - f"{status_display} {metric_display} mean", - f"{status_display} {metric_display} median", - f"{status_display} {metric_display} std dev", - ( - f"{status_display} {metric_display} " - "[min, 0.1, 1, 5, 10, 25, 75, 90, 95, 99, max]" - ), - ] - values: list[float | list[float]] = [ - dist_summary.mean, - dist_summary.median, - dist_summary.std_dev, - [ - dist_summary.min, - dist_summary.percentiles.p001, - 
dist_summary.percentiles.p01, - dist_summary.percentiles.p05, - dist_summary.percentiles.p10, - dist_summary.percentiles.p25, - dist_summary.percentiles.p75, - dist_summary.percentiles.p90, - dist_summary.percentiles.p95, - dist_summary.percentiles.p99, - dist_summary.max, - ], - ] - return headers, values - - def _get_benchmark_extras_headers_and_values( - self, - benchmark: GenerativeBenchmark, - ) -> tuple[list[str], list[str]]: - headers = ["Profile", "Backend", "Generator Data"] - values: list[str] = [ - benchmark.benchmarker.profile.model_dump_json(), - json.dumps(benchmark.benchmarker.backend), - json.dumps(benchmark.benchmarker.requests["data"]), - ] - - if len(headers) != len(values): - raise ValueError("Headers and values length mismatch.") - - return headers, values - - -@GenerativeBenchmarkerOutput.register("html") -class GenerativeBenchmarkerHTML(GenerativeBenchmarkerOutput): - """HTML output formatter for benchmark results.""" - - DEFAULT_FILE: ClassVar[str] = "benchmarks.html" - - @classmethod - def validated_kwargs( - cls, output_path: str | Path | None, **_kwargs - ) -> dict[str, Any]: - new_kwargs = {} - if output_path is not None: - new_kwargs["output_path"] = ( - Path(output_path) if not isinstance(output_path, Path) else output_path - ) - return new_kwargs - - output_path: Path = Field(default_factory=lambda: Path.cwd()) - - async def finalize(self, report: GenerativeBenchmarksReport) -> Path: - """ - Save the benchmark report as an HTML file. - - :param report: The completed benchmark report. - :return: Path to the saved HTML file. - """ - output_path = self.output_path - if output_path.is_dir(): - output_path = output_path / GenerativeBenchmarkerHTML.DEFAULT_FILE - output_path.parent.mkdir(parents=True, exist_ok=True) - - data_builder = UIDataBuilder(report.benchmarks) - data = data_builder.to_dict() - camel_data = recursive_key_update(deepcopy(data), camelize_str) - - ui_api_data = {} - for k, v in camel_data.items(): - placeholder_key = f"window.{k} = {{}};" - replacement_value = f"window.{k} = {json.dumps(v, indent=2)};\n" - ui_api_data[placeholder_key] = replacement_value - - create_report(ui_api_data, output_path) - - return output_path diff --git a/src/guidellm/benchmark/outputs/__init__.py b/src/guidellm/benchmark/outputs/__init__.py new file mode 100644 index 00000000..2e321605 --- /dev/null +++ b/src/guidellm/benchmark/outputs/__init__.py @@ -0,0 +1,24 @@ +""" +Output formatters for benchmark results. + +Provides output formatter implementations that transform benchmark reports into +various file formats including JSON, CSV, HTML, and console display. All formatters +extend the base GenerativeBenchmarkerOutput interface, enabling dynamic resolution +and flexible output configuration for benchmark result persistence and analysis. 
+""" + +from __future__ import annotations + +from .console import GenerativeBenchmarkerConsole +from .csv import GenerativeBenchmarkerCSV +from .html import GenerativeBenchmarkerHTML +from .output import GenerativeBenchmarkerOutput +from .serialized import GenerativeBenchmarkerSerialized + +__all__ = [ + "GenerativeBenchmarkerCSV", + "GenerativeBenchmarkerConsole", + "GenerativeBenchmarkerHTML", + "GenerativeBenchmarkerOutput", + "GenerativeBenchmarkerSerialized", +] diff --git a/src/guidellm/benchmark/outputs/console.py b/src/guidellm/benchmark/outputs/console.py new file mode 100644 index 00000000..2dc8ce3c --- /dev/null +++ b/src/guidellm/benchmark/outputs/console.py @@ -0,0 +1,620 @@ +""" +Console output formatter for generative benchmarker results. + +This module provides console-based output formatting for benchmark reports, organizing +metrics into structured tables that display request statistics, latency measurements, +throughput data, and modality-specific metrics (text, image, video, audio). It uses +the Console utility to render multi-column tables with proper alignment and formatting +for terminal display. +""" + +from __future__ import annotations + +from collections import defaultdict +from dataclasses import dataclass, field +from typing import Any, Literal, cast + +from pydantic import Field + +from guidellm.benchmark.outputs.output import GenerativeBenchmarkerOutput +from guidellm.benchmark.schemas import GenerativeBenchmarksReport +from guidellm.schemas import DistributionSummary, StatusDistributionSummary +from guidellm.utils import Console, safe_format_number, safe_format_timestamp + +__all__ = ["GenerativeBenchmarkerConsole"] + + +@dataclass +class ConsoleTableColumn: + """ + Data structure for a single console table column. + + Stores column metadata (group, name, units, type) and accumulated values for + rendering formatted table output with proper type-specific formatting and precision. + + :cvar group: Optional group header for related columns + :cvar name: Column name displayed in header + :cvar units: Optional unit label for numeric values + :cvar type_: Data type determining formatting (number, text, timestamp) + :cvar precision: Decimal precision for numeric formatting + :cvar values: Accumulated values for this column across rows + """ + + group: str | None = None + name: str | None = None + units: str | None = None + type_: Literal["number", "text", "timestamp"] = "number" + precision: int = 1 + values: list[str | float | int | None] = field(default_factory=list) + + +class ConsoleTableColumnsCollection(dict[str, ConsoleTableColumn]): + """ + Collection manager for console table columns. + + Extends dict to provide specialized methods for adding values and statistics to + columns, automatically creating columns as needed and organizing them by composite + keys for consistent table rendering. + """ + + def add_value( + self, + value: str | float | int | None, + group: str | None = None, + name: str | None = None, + units: str | None = None, + type_: Literal["number", "text", "timestamp"] = "number", + precision: int = 1, + ): + """ + Add a value to a column, creating the column if it doesn't exist. 
+ + :param value: The value to add to the column + :param group: Optional group header for the column + :param name: Column name for display + :param units: Optional unit label + :param type_: Data type for formatting + :param precision: Decimal precision for numbers + """ + key = f"{group}_{name}_{units}" + + if key not in self: + self[key] = ConsoleTableColumn( + group=group, name=name, units=units, type_=type_, precision=precision + ) + + self[key].values.append(value) + + def add_stats( + self, + stats: StatusDistributionSummary | None, + status: Literal["successful", "incomplete", "errored", "total"] = "successful", + group: str | None = None, + name: str | None = None, + precision: int = 1, + ): + """ + Add statistical summary columns (mean and p95) for a metric. + + Creates paired mean/p95 columns automatically and appends values from the + specified status category of the distribution summary. + + :param stats: Distribution summary containing status-specific statistics + :param status: Status category to extract statistics from + :param group: Optional group header for the columns + :param name: Column name for display + :param precision: Decimal precision for numbers + """ + key = f"{group}_{name}" + + if f"{key}_mean" not in self: + self[f"{key}_mean"] = ConsoleTableColumn( + group=group, name=name, units="Mean", precision=precision + ) + self[f"{key}_p95"] = ConsoleTableColumn( + group=group, name=name, units="p95", precision=precision + ) + + status_stats: DistributionSummary | None = ( + getattr(stats, status) if stats else None + ) + self[f"{key}_mean"].values.append(status_stats.mean if status_stats else None) + self[f"{key}_p95"].values.append( + status_stats.percentiles.p95 if status_stats else None + ) + + def get_table_data(self) -> tuple[list[list[str]], list[list[str]]]: + """ + Convert column collection to formatted table data. + + Transforms stored columns and values into header and value lists suitable for + console table rendering, applying type-specific formatting. + + :return: Tuple of (headers, values) where each is a list of column string lists + """ + headers: list[list[str]] = [] + values: list[list[str]] = [] + + for column in self.values(): + headers.append([column.group or "", column.name or "", column.units or ""]) + formatted_values: list[str] = [] + for value in column.values: + if column.type_ == "text": + formatted_values.append(str(value)) + continue + + if not isinstance(value, float | int) and value is not None: + raise ValueError( + f"Expected numeric value for column '{column.name}', " + f"got: {value}" + ) + + if column.type_ == "timestamp": + formatted_values.append( + safe_format_timestamp(cast("float | None", value)) + ) + elif column.type_ == "number": + formatted_values.append( + safe_format_number( + value, + precision=column.precision, + ) + ) + else: + raise ValueError(f"Unsupported column type: {column.type_}") + values.append(formatted_values) + + return headers, values + + +@GenerativeBenchmarkerOutput.register("console") +class GenerativeBenchmarkerConsole(GenerativeBenchmarkerOutput): + """ + Console output formatter for benchmark reports. + + Renders benchmark results as formatted tables in the terminal, organizing metrics + by category (run summary, request counts, latency, throughput, modality-specific) + with proper alignment and type-specific formatting for readability. + """ + + @classmethod + def validated_kwargs(cls, *_args, **_kwargs) -> dict[str, Any]: + """ + Validate and return keyword arguments for initialization. 
+ + :return: Empty dict as no additional kwargs are required + """ + return {} + + console: Console = Field( + default_factory=Console, + description="Console utility for rendering formatted tables", + ) + + async def finalize(self, report: GenerativeBenchmarksReport) -> str: + """ + Print the complete benchmark report to the console. + + Renders all metric tables including run summary, request counts, latency, + throughput, and modality-specific statistics to the console. + + :param report: The completed benchmark report + :return: Status message indicating output location + """ + self.print_run_summary_table(report) + self.print_text_table(report) + self.print_image_table(report) + self.print_video_table(report) + self.print_audio_table(report) + self.print_request_counts_table(report) + self.print_request_latency_table(report) + self.print_server_throughput_table(report) + + return "printed to console" + + def print_run_summary_table(self, report: GenerativeBenchmarksReport): + """ + Print the run summary table with timing and token information. + + :param report: The benchmark report containing run metadata + """ + columns = ConsoleTableColumnsCollection() + + for benchmark in report.benchmarks: + columns.add_value( + benchmark.config.strategy.type_, + group="Benchmark", + name="Strategy", + type_="text", + ) + columns.add_value( + benchmark.start_time, group="Timings", name="Start", type_="timestamp" + ) + columns.add_value( + benchmark.end_time, group="Timings", name="End", type_="timestamp" + ) + columns.add_value( + benchmark.duration, group="Timings", name="Dur", units="Sec" + ) + columns.add_value( + report.args.warmup, group="Timings", name="Warm", units="Sec" + ) + columns.add_value( + report.args.cooldown, group="Timings", name="Cool", units="Sec" + ) + + for token_metrics, group in [ + (benchmark.metrics.prompt_token_count, "Input Tokens"), + (benchmark.metrics.output_token_count, "Output Tokens"), + ]: + columns.add_value( + token_metrics.successful.total_sum, + group=group, + name="Comp", + units="Tot", + ) + columns.add_value( + token_metrics.incomplete.total_sum, + group=group, + name="Inc", + units="Tot", + ) + columns.add_value( + token_metrics.errored.total_sum, + group=group, + name="Err", + units="Tot", + ) + + headers, values = columns.get_table_data() + self.console.print("\n") + self.console.print_table(headers, values, title="Run Summary Info") + + def print_text_table(self, report: GenerativeBenchmarksReport): + """ + Print text-specific metrics table if any text data exists. + + :param report: The benchmark report containing text metrics + """ + self._print_modality_table( + report=report, + modality="text", + title="Text Metrics Statistics (Completed Requests)", + metric_groups=[ + ("tokens", "Tokens"), + ("words", "Words"), + ("characters", "Characters"), + ], + ) + + def print_image_table(self, report: GenerativeBenchmarksReport): + """ + Print image-specific metrics table if any image data exists. + + :param report: The benchmark report containing image metrics + """ + self._print_modality_table( + report=report, + modality="image", + title="Image Metrics Statistics (Completed Requests)", + metric_groups=[ + ("tokens", "Tokens"), + ("images", "Images"), + ("pixels", "Pixels"), + ("bytes", "Bytes"), + ], + ) + + def print_video_table(self, report: GenerativeBenchmarksReport): + """ + Print video-specific metrics table if any video data exists. 
+ + :param report: The benchmark report containing video metrics + """ + self._print_modality_table( + report=report, + modality="video", + title="Video Metrics Statistics (Completed Requests)", + metric_groups=[ + ("tokens", "Tokens"), + ("frames", "Frames"), + ("seconds", "Seconds"), + ("bytes", "Bytes"), + ], + ) + + def print_audio_table(self, report: GenerativeBenchmarksReport): + """ + Print audio-specific metrics table if any audio data exists. + + :param report: The benchmark report containing audio metrics + """ + self._print_modality_table( + report=report, + modality="audio", + title="Audio Metrics Statistics (Completed Requests)", + metric_groups=[ + ("tokens", "Tokens"), + ("samples", "Samples"), + ("seconds", "Seconds"), + ("bytes", "Bytes"), + ], + ) + + def print_request_counts_table(self, report: GenerativeBenchmarksReport): + """ + Print request token count statistics table. + + :param report: The benchmark report containing request count metrics + """ + columns = ConsoleTableColumnsCollection() + + for benchmark in report.benchmarks: + columns.add_value( + benchmark.config.strategy.type_, + group="Benchmark", + name="Strategy", + type_="text", + ) + columns.add_stats( + benchmark.metrics.prompt_token_count, + group="Input Tok", + name="Per Req", + ) + columns.add_stats( + benchmark.metrics.output_token_count, + group="Output Tok", + name="Per Req", + ) + columns.add_stats( + benchmark.metrics.total_token_count, + group="Total Tok", + name="Per Req", + ) + columns.add_stats( + benchmark.metrics.request_streaming_iterations_count, + group="Stream Iter", + name="Per Req", + ) + columns.add_stats( + benchmark.metrics.output_tokens_per_iteration, + group="Output Tok", + name="Per Stream Iter", + ) + + headers, values = columns.get_table_data() + self.console.print("\n") + self.console.print_table( + headers, + values, + title="Request Token Statistics (Completed Requests)", + ) + + def print_request_latency_table(self, report: GenerativeBenchmarksReport): + """ + Print request latency metrics table. + + :param report: The benchmark report containing latency metrics + """ + columns = ConsoleTableColumnsCollection() + + for benchmark in report.benchmarks: + columns.add_value( + benchmark.config.strategy.type_, + group="Benchmark", + name="Strategy", + type_="text", + ) + columns.add_stats( + benchmark.metrics.request_latency, + group="Request Latency", + name="Sec", + ) + columns.add_stats( + benchmark.metrics.time_to_first_token_ms, + group="TTFT", + name="ms", + ) + columns.add_stats( + benchmark.metrics.inter_token_latency_ms, + group="ITL", + name="ms", + ) + columns.add_stats( + benchmark.metrics.time_per_output_token_ms, + group="TPOT", + name="ms", + ) + + headers, values = columns.get_table_data() + self.console.print("\n") + self.console.print_table( + headers, + values, + title="Request Latency Statistics (Completed Requests)", + ) + + def print_server_throughput_table(self, report: GenerativeBenchmarksReport): + """ + Print server throughput metrics table. 
+ + :param report: The benchmark report containing throughput metrics + """ + columns = ConsoleTableColumnsCollection() + + for benchmark in report.benchmarks: + columns.add_value( + benchmark.config.strategy.type_, + group="Benchmark", + name="Strategy", + type_="text", + ) + columns.add_stats( + benchmark.metrics.requests_per_second, + group="Requests", + name="Per Sec", + ) + columns.add_stats( + benchmark.metrics.request_concurrency, + group="Requests", + name="Concurrency", + ) + columns.add_stats( + benchmark.metrics.prompt_tokens_per_second, + group="Input Tokens", + name="Per Sec", + ) + columns.add_stats( + benchmark.metrics.output_tokens_per_second, + group="Output Tokens", + name="Per Sec", + ) + columns.add_stats( + benchmark.metrics.tokens_per_second, + group="Total Tokens", + name="Per Sec", + ) + + headers, values = columns.get_table_data() + self.console.print("\n") + self.console.print_table(headers, values, title="Server Throughput Statistics") + + def _print_modality_table( + self, + report: GenerativeBenchmarksReport, + modality: Literal["text", "image", "video", "audio"], + title: str, + metric_groups: list[tuple[str, str]], + ): + columns: dict[str, ConsoleTableColumnsCollection] = defaultdict( + ConsoleTableColumnsCollection + ) + + for benchmark in report.benchmarks: + columns["labels"].add_value( + benchmark.config.strategy.type_, + group="Benchmark", + name="Strategy", + type_="text", + ) + + modality_metrics = getattr(benchmark.metrics, modality) + + for metric_attr, display_name in metric_groups: + metric_obj = getattr(modality_metrics, metric_attr, None) + self._add_input_output_stats( + columns=columns, + metric_obj=metric_obj, + metric_key=metric_attr, + display_name=display_name, + ) + + self._print_inp_out_tables( + title=title, + labels=columns["labels"], + groups=[ + (columns[f"{metric_attr}.input"], columns[f"{metric_attr}.output"]) + for metric_attr, _ in metric_groups + ], + ) + + def _print_inp_out_tables( + self, + title: str, + labels: ConsoleTableColumnsCollection, + groups: list[ + tuple[ConsoleTableColumnsCollection, ConsoleTableColumnsCollection] + ], + ): + input_headers, input_values = [], [] + output_headers, output_values = [], [] + input_has_data = False + output_has_data = False + + for input_columns, output_columns in groups: + # Check if columns have any non-None values + type_input_has_data = any( + any(value is not None for value in column.values) + for column in input_columns.values() + ) + type_output_has_data = any( + any(value is not None for value in column.values) + for column in output_columns.values() + ) + + if not (type_input_has_data or type_output_has_data): + continue + + input_has_data = input_has_data or type_input_has_data + output_has_data = output_has_data or type_output_has_data + + input_type_headers, input_type_columns = input_columns.get_table_data() + output_type_headers, output_type_columns = output_columns.get_table_data() + + input_headers.extend(input_type_headers) + input_values.extend(input_type_columns) + output_headers.extend(output_type_headers) + output_values.extend(output_type_columns) + + if not (input_has_data or output_has_data): + return + + labels_headers, labels_values = labels.get_table_data() + header_cols_groups = [] + value_cols_groups = [] + + if input_has_data: + header_cols_groups.append(labels_headers + input_headers) + value_cols_groups.append(labels_values + input_values) + if output_has_data: + header_cols_groups.append(labels_headers + output_headers) + 
value_cols_groups.append(labels_values + output_values) + + if header_cols_groups and value_cols_groups: + self.console.print("\n") + self.console.print_tables( + header_cols_groups=header_cols_groups, + value_cols_groups=value_cols_groups, + title=title, + ) + + def _add_input_output_stats( + self, + columns: dict[str, ConsoleTableColumnsCollection], + metric_obj: Any, + metric_key: str, + display_name: str, + ): + input_stats: StatusDistributionSummary | None = ( + getattr(metric_obj, "input", None) if metric_obj else None + ) + input_per_second_stats: StatusDistributionSummary | None = ( + getattr(metric_obj, "input_per_second", None) if metric_obj else None + ) + output_stats: StatusDistributionSummary | None = ( + getattr(metric_obj, "output", None) if metric_obj else None + ) + output_per_second_stats: StatusDistributionSummary | None = ( + getattr(metric_obj, "output_per_second", None) if metric_obj else None + ) + + columns[f"{metric_key}.input"].add_stats( + input_stats, + group=f"Input {display_name}", + name="Per Request", + ) + columns[f"{metric_key}.input"].add_stats( + input_per_second_stats, + group=f"Input {display_name}", + name="Per Second", + ) + columns[f"{metric_key}.output"].add_stats( + output_stats, + group=f"Output {display_name}", + name="Per Request", + ) + columns[f"{metric_key}.output"].add_stats( + output_per_second_stats, + group=f"Output {display_name}", + name="Per Second", + ) diff --git a/src/guidellm/benchmark/outputs/csv.py b/src/guidellm/benchmark/outputs/csv.py new file mode 100644 index 00000000..c1ea2479 --- /dev/null +++ b/src/guidellm/benchmark/outputs/csv.py @@ -0,0 +1,692 @@ +""" +CSV output formatter for benchmark results. + +This module provides the GenerativeBenchmarkerCSV class which exports benchmark +reports to CSV format with comprehensive metrics including timing, throughput, +latency, modality data, and scheduler information. The CSV output uses multi-row +headers to organize metrics hierarchically and includes both summary statistics +and distribution percentiles. +""" + +from __future__ import annotations + +import csv +import json +from pathlib import Path +from typing import Annotated, Any, ClassVar, Literal + +from pydantic import Field + +from guidellm.benchmark.outputs.output import GenerativeBenchmarkerOutput +from guidellm.benchmark.schemas import GenerativeBenchmark, GenerativeBenchmarksReport +from guidellm.schemas import DistributionSummary, StatusDistributionSummary +from guidellm.utils import safe_format_timestamp + +__all__ = ["GenerativeBenchmarkerCSV"] + +TIMESTAMP_FORMAT: Annotated[str, "Format string for timestamp output in CSV files"] = ( + "%Y-%m-%d %H:%M:%S" +) +MODALITY_METRICS: Annotated[ + dict[str, list[tuple[str, str]]], + "Mapping of modality types to their metric names and display labels", +] = { + "text": [ + ("tokens", "Tokens"), + ("words", "Words"), + ("characters", "Characters"), + ], + "image": [ + ("tokens", "Tokens"), + ("images", "Images"), + ("pixels", "Pixels"), + ("bytes", "Bytes"), + ], + "video": [ + ("tokens", "Tokens"), + ("frames", "Frames"), + ("seconds", "Seconds"), + ("bytes", "Bytes"), + ], + "audio": [ + ("tokens", "Tokens"), + ("samples", "Samples"), + ("seconds", "Seconds"), + ("bytes", "Bytes"), + ], +} + + +@GenerativeBenchmarkerOutput.register("csv") +class GenerativeBenchmarkerCSV(GenerativeBenchmarkerOutput): + """ + CSV output formatter for benchmark results. 
+ + Exports comprehensive benchmark data to CSV format with multi-row headers + organizing metrics into categories including run information, timing, request + counts, latency, throughput, modality-specific data, and scheduler state. Each + benchmark run becomes a row with statistical distributions represented as + mean, median, standard deviation, and percentiles. + + :cvar DEFAULT_FILE: Default filename for CSV output + """ + + DEFAULT_FILE: ClassVar[str] = "benchmarks.csv" + + @classmethod + def validated_kwargs( + cls, output_path: str | Path | None, **_kwargs + ) -> dict[str, Any]: + """ + Validate and normalize constructor keyword arguments. + + :param output_path: Path for CSV output file or directory + :param _kwargs: Additional keyword arguments (ignored) + :return: Normalized keyword arguments dictionary + """ + new_kwargs = {} + if output_path is not None: + new_kwargs["output_path"] = ( + Path(output_path) if not isinstance(output_path, Path) else output_path + ) + return new_kwargs + + output_path: Path = Field( + default_factory=lambda: Path.cwd(), + description=( + "Path where the CSV file will be saved, defaults to current directory" + ), + ) + + async def finalize(self, report: GenerativeBenchmarksReport) -> Path: + """ + Save the benchmark report as a CSV file. + + :param report: The completed benchmark report + :return: Path to the saved CSV file + """ + output_path = self.output_path + if output_path.is_dir(): + output_path = output_path / GenerativeBenchmarkerCSV.DEFAULT_FILE + output_path.parent.mkdir(parents=True, exist_ok=True) + + with output_path.open("w", newline="") as file: + writer = csv.writer(file) + headers: list[list[str]] = [] + rows: list[list[str | int | float]] = [] + + for benchmark in report.benchmarks: + benchmark_headers: list[list[str]] = [] + benchmark_values: list[str | int | float] = [] + + self._add_run_info(benchmark, benchmark_headers, benchmark_values) + self._add_benchmark_info(benchmark, benchmark_headers, benchmark_values) + self._add_timing_info(benchmark, benchmark_headers, benchmark_values) + self._add_request_counts(benchmark, benchmark_headers, benchmark_values) + self._add_request_latency_metrics( + benchmark, benchmark_headers, benchmark_values + ) + self._add_server_throughput_metrics( + benchmark, benchmark_headers, benchmark_values + ) + for modality_name in ["text", "image", "video", "audio"]: + self._add_modality_metrics( + benchmark, + modality_name, # type: ignore[arg-type] + benchmark_headers, + benchmark_values, + ) + self._add_scheduler_info(benchmark, benchmark_headers, benchmark_values) + + if not headers: + headers = benchmark_headers + rows.append(benchmark_values) + + self._write_multirow_header(writer, headers) + for row in rows: + writer.writerow(row) + + return output_path + + def _write_multirow_header(self, writer: Any, headers: list[list[str]]) -> None: + """ + Write multi-row header to CSV for hierarchical metric organization. + + :param writer: CSV writer instance + :param headers: List of column header hierarchies as string lists + """ + max_rows = max((len(col) for col in headers), default=0) + for row_idx in range(max_rows): + row = [col[row_idx] if row_idx < len(col) else "" for col in headers] + writer.writerow(row) + + def _add_field( + self, + headers: list[list[str]], + values: list[str | int | float], + group: str, + field_name: str, + value: Any, + units: str = "", + ) -> None: + """ + Add a single field to headers and values lists. 
+ + :param headers: List of header hierarchies to append to + :param values: List of values to append to + :param group: Top-level category for the field + :param field_name: Name of the field + :param value: Value for the field + :param units: Optional units for the field + """ + headers.append([group, field_name, units]) + values.append(value) + + def _add_run_info( + self, + benchmark: GenerativeBenchmark, + headers: list[list[str]], + values: list[str | int | float], + ) -> None: + """ + Add overall run identification and configuration information. + + :param benchmark: Benchmark data to extract run info from + :param headers: List of header hierarchies to append to + :param values: List of values to append to + """ + self._add_field(headers, values, "Run Info", "Run ID", benchmark.config.run_id) + self._add_field( + headers, values, "Run Info", "Run Index", benchmark.config.run_index + ) + self._add_field( + headers, + values, + "Run Info", + "Profile", + benchmark.config.profile.model_dump_json(), + ) + self._add_field( + headers, + values, + "Run Info", + "Requests", + json.dumps(benchmark.config.requests), + ) + self._add_field( + headers, values, "Run Info", "Backend", json.dumps(benchmark.config.backend) + ) + self._add_field( + headers, + values, + "Run Info", + "Environment", + json.dumps(benchmark.config.environment), + ) + + def _add_benchmark_info( + self, + benchmark: GenerativeBenchmark, + headers: list[list[str]], + values: list[str | int | float], + ) -> None: + """ + Add individual benchmark configuration details. + + :param benchmark: Benchmark data to extract configuration from + :param headers: List of header hierarchies to append to + :param values: List of values to append to + """ + self._add_field(headers, values, "Benchmark", "Type", benchmark.type_) + self._add_field(headers, values, "Benchmark", "ID", benchmark.config.id_) + self._add_field( + headers, values, "Benchmark", "Strategy", benchmark.config.strategy.type_ + ) + self._add_field( + headers, + values, + "Benchmark", + "Constraints", + json.dumps(benchmark.config.constraints), + ) + + def _add_timing_info( + self, + benchmark: GenerativeBenchmark, + headers: list[list[str]], + values: list[str | int | float], + ) -> None: + """ + Add timing information including start, end, duration, warmup, and cooldown. 
+ + :param benchmark: Benchmark data to extract timing from + :param headers: List of header hierarchies to append to + :param values: List of values to append to + """ + timing_fields: list[tuple[str, Any]] = [ + ("Start Time", benchmark.scheduler_metrics.start_time), + ("Request Start Time", benchmark.scheduler_metrics.request_start_time), + ("Measure Start Time", benchmark.scheduler_metrics.measure_start_time), + ("Measure End Time", benchmark.scheduler_metrics.measure_end_time), + ("Request End Time", benchmark.scheduler_metrics.request_end_time), + ("End Time", benchmark.scheduler_metrics.end_time), + ] + for field_name, timestamp in timing_fields: + self._add_field( + headers, + values, + "Timings", + field_name, + safe_format_timestamp(timestamp, TIMESTAMP_FORMAT), + ) + + duration_fields: list[tuple[str, float]] = [ + ("Duration", benchmark.duration), + ("Warmup", benchmark.config.warmup or 0.0), + ("Cooldown", benchmark.config.cooldown or 0.0), + ] + for field_name, duration_value in duration_fields: + self._add_field( + headers, values, "Timings", field_name, duration_value, "Sec" + ) + + def _add_request_counts( + self, + benchmark: GenerativeBenchmark, + headers: list[list[str]], + values: list[str | int | float], + ) -> None: + """ + Add request count totals by status. + + :param benchmark: Benchmark data to extract request counts from + :param headers: List of header hierarchies to append to + :param values: List of values to append to + """ + for status in ["successful", "incomplete", "errored", "total"]: + self._add_field( + headers, + values, + "Request Counts", + status.capitalize(), + getattr(benchmark.metrics.request_totals, status), + ) + + def _add_request_latency_metrics( + self, + benchmark: GenerativeBenchmark, + headers: list[list[str]], + values: list[str | int | float], + ) -> None: + """ + Add request latency and streaming metrics. + + :param benchmark: Benchmark data to extract latency metrics from + :param headers: List of header hierarchies to append to + :param values: List of values to append to + """ + self._add_stats_for_metric( + headers, values, benchmark.metrics.request_latency, "Request Latency", "Sec" + ) + self._add_stats_for_metric( + headers, + values, + benchmark.metrics.request_streaming_iterations_count, + "Streaming Iterations", + "Count", + ) + self._add_stats_for_metric( + headers, + values, + benchmark.metrics.time_to_first_token_ms, + "Time to First Token", + "ms", + ) + self._add_stats_for_metric( + headers, + values, + benchmark.metrics.time_per_output_token_ms, + "Time per Output Token", + "ms", + ) + self._add_stats_for_metric( + headers, + values, + benchmark.metrics.inter_token_latency_ms, + "Inter Token Latency", + "ms", + ) + + def _add_server_throughput_metrics( + self, + benchmark: GenerativeBenchmark, + headers: list[list[str]], + values: list[str | int | float], + ) -> None: + """ + Add server throughput metrics including requests, tokens, and concurrency. 
+ + :param benchmark: Benchmark data to extract throughput metrics from + :param headers: List of header hierarchies to append to + :param values: List of values to append to + """ + self._add_stats_for_metric( + headers, + values, + benchmark.metrics.requests_per_second, + "Server Throughput", + "Requests/Sec", + ) + self._add_stats_for_metric( + headers, + values, + benchmark.metrics.request_concurrency, + "Server Throughput", + "Concurrency", + ) + self._add_stats_for_metric( + headers, + values, + benchmark.metrics.prompt_token_count, + "Token Metrics", + "Input Tokens", + ) + self._add_stats_for_metric( + headers, + values, + benchmark.metrics.output_token_count, + "Token Metrics", + "Output Tokens", + ) + self._add_stats_for_metric( + headers, + values, + benchmark.metrics.total_token_count, + "Token Metrics", + "Total Tokens", + ) + self._add_stats_for_metric( + headers, + values, + benchmark.metrics.prompt_tokens_per_second, + "Token Throughput", + "Input Tokens/Sec", + ) + self._add_stats_for_metric( + headers, + values, + benchmark.metrics.output_tokens_per_second, + "Token Throughput", + "Output Tokens/Sec", + ) + self._add_stats_for_metric( + headers, + values, + benchmark.metrics.tokens_per_second, + "Token Throughput", + "Total Tokens/Sec", + ) + self._add_stats_for_metric( + headers, + values, + benchmark.metrics.output_tokens_per_iteration, + "Token Streaming", + "Output Tokens/Iter", + ) + self._add_stats_for_metric( + headers, + values, + benchmark.metrics.iter_tokens_per_iteration, + "Token Streaming", + "Iter Tokens/Iter", + ) + + def _add_modality_metrics( + self, + benchmark: GenerativeBenchmark, + modality: Literal["text", "image", "video", "audio"], + headers: list[list[str]], + values: list[str | int | float], + ) -> None: + """ + Add modality-specific metrics for text, image, video, or audio data. + + :param benchmark: Benchmark data to extract modality metrics from + :param modality: Type of modality to extract metrics for + :param headers: List of header hierarchies to append to + :param values: List of values to append to + """ + modality_summary = getattr(benchmark.metrics, modality) + metric_definitions = MODALITY_METRICS[modality] + + for metric_name, display_name in metric_definitions: + metric_obj = getattr(modality_summary, metric_name, None) + if metric_obj is None: + continue + + for io_type in ["input", "output", "total"]: + dist_summary = getattr(metric_obj, io_type, None) + if dist_summary is None: + continue + + if not self._has_distribution_data(dist_summary): + continue + + self._add_stats_for_metric( + headers, + values, + dist_summary, + f"{modality.capitalize()} {display_name}", + io_type.capitalize(), + ) + + def _has_distribution_data(self, dist_summary: StatusDistributionSummary) -> bool: + """ + Check if distribution summary contains any data. + + :param dist_summary: Distribution summary to check + :return: True if summary contains data, False otherwise + """ + return any( + getattr(dist_summary, status, None) is not None + and getattr(dist_summary, status).total_sum > 0.0 + for status in ["successful", "incomplete", "errored"] + ) + + def _add_scheduler_info( + self, + benchmark: GenerativeBenchmark, + headers: list[list[str]], + values: list[str | int | float], + ) -> None: + """ + Add scheduler state and performance information. 
+ + :param benchmark: Benchmark data to extract scheduler info from + :param headers: List of header hierarchies to append to + :param values: List of values to append to + """ + self._add_scheduler_state(benchmark, headers, values) + self._add_scheduler_metrics(benchmark, headers, values) + + def _add_scheduler_state( + self, + benchmark: GenerativeBenchmark, + headers: list[list[str]], + values: list[str | int | float], + ) -> None: + """ + Add scheduler state information including request counts and timing. + + :param benchmark: Benchmark data to extract scheduler state from + :param headers: List of header hierarchies to append to + :param values: List of values to append to + """ + state = benchmark.scheduler_state + + state_fields: list[tuple[str, Any]] = [ + ("Node ID", state.node_id), + ("Num Processes", state.num_processes), + ("Created Requests", state.created_requests), + ("Processed Requests", state.processed_requests), + ("Successful Requests", state.successful_requests), + ("Errored Requests", state.errored_requests), + ("Cancelled Requests", state.cancelled_requests), + ] + + for field_name, value in state_fields: + self._add_field(headers, values, "Scheduler State", field_name, value) + + if state.end_queuing_time: + self._add_field( + headers, + values, + "Scheduler State", + "End Queuing Time", + safe_format_timestamp(state.end_queuing_time, TIMESTAMP_FORMAT), + ) + end_queuing_constraints_dict = { + key: constraint.model_dump() + for key, constraint in state.end_queuing_constraints.items() + } + self._add_field( + headers, + values, + "Scheduler State", + "End Queuing Constraints", + json.dumps(end_queuing_constraints_dict), + ) + + if state.end_processing_time: + self._add_field( + headers, + values, + "Scheduler State", + "End Processing Time", + safe_format_timestamp(state.end_processing_time, TIMESTAMP_FORMAT), + ) + end_processing_constraints_dict = { + key: constraint.model_dump() + for key, constraint in state.end_processing_constraints.items() + } + self._add_field( + headers, + values, + "Scheduler State", + "End Processing Constraints", + json.dumps(end_processing_constraints_dict), + ) + + def _add_scheduler_metrics( + self, + benchmark: GenerativeBenchmark, + headers: list[list[str]], + values: list[str | int | float], + ) -> None: + """ + Add scheduler performance metrics including delays and processing times. 
+ + :param benchmark: Benchmark data to extract scheduler metrics from + :param headers: List of header hierarchies to append to + :param values: List of values to append to + """ + metrics = benchmark.scheduler_metrics + + requests_made_fields: list[tuple[str, int]] = [ + ("Requests Made Successful", metrics.requests_made.successful), + ("Requests Made Incomplete", metrics.requests_made.incomplete), + ("Requests Made Errored", metrics.requests_made.errored), + ("Requests Made Total", metrics.requests_made.total), + ] + for field_name, value in requests_made_fields: + self._add_field(headers, values, "Scheduler Metrics", field_name, value) + + timing_metrics: list[tuple[str, float]] = [ + ("Queued Time Avg", metrics.queued_time_avg), + ("Resolve Start Delay Avg", metrics.resolve_start_delay_avg), + ( + "Resolve Targeted Start Delay Avg", + metrics.resolve_targeted_start_delay_avg, + ), + ("Request Start Delay Avg", metrics.request_start_delay_avg), + ( + "Request Targeted Start Delay Avg", + metrics.request_targeted_start_delay_avg, + ), + ("Request Time Avg", metrics.request_time_avg), + ("Resolve End Delay Avg", metrics.resolve_end_delay_avg), + ("Resolve Time Avg", metrics.resolve_time_avg), + ("Finalized Delay Avg", metrics.finalized_delay_avg), + ("Processed Delay Avg", metrics.processed_delay_avg), + ] + for field_name, timing in timing_metrics: + self._add_field( + headers, values, "Scheduler Metrics", field_name, timing, "Sec" + ) + + def _add_stats_for_metric( + self, + headers: list[list[str]], + values: list[str | int | float], + metric: StatusDistributionSummary | DistributionSummary, + group: str, + units: str, + ) -> None: + """ + Add statistical summaries for a metric across all statuses. + + :param headers: List of header hierarchies to append to + :param values: List of values to append to + :param metric: Distribution summary to extract statistics from + :param group: Top-level category for the metric + :param units: Units for the metric values + """ + if isinstance(metric, StatusDistributionSummary): + for status in ["successful", "incomplete", "errored"]: + dist = getattr(metric, status, None) + if dist is None or dist.total_sum == 0.0: + continue + self._add_distribution_stats( + headers, values, dist, group, units, status + ) + else: + self._add_distribution_stats(headers, values, metric, group, units, None) + + def _add_distribution_stats( + self, + headers: list[list[str]], + values: list[str | int | float], + dist: DistributionSummary, + group: str, + units: str, + status: str | None, + ) -> None: + """ + Add distribution statistics including mean, median, and percentiles. 
+ + :param headers: List of header hierarchies to append to + :param values: List of values to append to + :param dist: Distribution summary with statistical data + :param group: Top-level category for the metric + :param units: Units for the metric values + :param status: Request status (successful, incomplete, errored) or None + """ + status_prefix = f"{status.capitalize()} " if status else "" + + headers.append([group, f"{status_prefix}{units}", "Mean"]) + values.append(dist.mean) + + headers.append([group, f"{status_prefix}{units}", "Median"]) + values.append(dist.median) + + headers.append([group, f"{status_prefix}{units}", "Std Dev"]) + values.append(dist.std_dev) + + headers.append([group, f"{status_prefix}{units}", "Percentiles"]) + percentiles_str = ( + f"[{dist.min}, {dist.percentiles.p001}, {dist.percentiles.p01}, " + f"{dist.percentiles.p05}, {dist.percentiles.p10}, {dist.percentiles.p25}, " + f"{dist.percentiles.p75}, {dist.percentiles.p90}, {dist.percentiles.p95}, " + f"{dist.percentiles.p99}, {dist.max}]" + ) + values.append(percentiles_str) diff --git a/src/guidellm/benchmark/outputs/html.py b/src/guidellm/benchmark/outputs/html.py new file mode 100644 index 00000000..34cf7107 --- /dev/null +++ b/src/guidellm/benchmark/outputs/html.py @@ -0,0 +1,422 @@ +""" +HTML output formatter for benchmark results. + +Transforms benchmark data into interactive web-based reports by building UI data +structures, converting keys to camelCase for JavaScript compatibility, and injecting +formatted data into HTML templates. The formatter processes GenerativeBenchmark +instances and their associated metrics, creating histogram buckets for distributions, +formatting percentile statistics for tabular display, and embedding all data as +JavaScript objects within an HTML template for client-side rendering and visualization. +""" + +from __future__ import annotations + +import json +import random +import re +from collections import defaultdict +from copy import deepcopy +from math import ceil +from pathlib import Path +from typing import Any, ClassVar + +from loguru import logger +from pydantic import BaseModel, Field, computed_field + +from guidellm.benchmark.outputs.output import GenerativeBenchmarkerOutput +from guidellm.benchmark.schemas import ( + BenchmarkGenerativeTextArgs, + GenerativeBenchmark, + GenerativeBenchmarksReport, +) +from guidellm.schemas import DistributionSummary +from guidellm.settings import settings +from guidellm.utils import camelize_str, recursive_key_update +from guidellm.utils.text import load_text + +__all__ = ["GenerativeBenchmarkerHTML"] + + +@GenerativeBenchmarkerOutput.register("html") +class GenerativeBenchmarkerHTML(GenerativeBenchmarkerOutput): + """ + HTML output formatter for benchmark results. + + Generates interactive HTML reports from benchmark data by transforming results + into camelCase JSON structures and injecting them into HTML templates. The + formatter processes benchmark metrics, creates histogram distributions, and + embeds all data into a pre-built HTML template for browser-based visualization. + Reports are saved to the specified output path or current working directory. 
+ + :cvar DEFAULT_FILE: Default filename for HTML output when a directory is provided + """ + + DEFAULT_FILE: ClassVar[str] = "benchmarks.html" + + output_path: Path = Field( + default_factory=lambda: Path.cwd(), + description=( + "Directory or file path for saving the HTML report, " + "defaults to current working directory" + ), + ) + + @classmethod + def validated_kwargs( + cls, output_path: str | Path | None, **_kwargs + ) -> dict[str, Any]: + """ + Validate and normalize output path argument. + + :param output_path: Output file or directory path for the HTML report + :return: Dictionary containing validated output_path if provided + """ + validated: dict[str, Any] = {} + if output_path is not None: + validated["output_path"] = ( + Path(output_path) if not isinstance(output_path, Path) else output_path + ) + return validated + + async def finalize(self, report: GenerativeBenchmarksReport) -> Path: + """ + Generate and save the HTML benchmark report. + + Transforms benchmark data into camelCase JSON format, injects it into the + HTML template, and writes the resulting report to the output path. Creates + parent directories if they don't exist. + + :param report: Completed benchmark report containing all results + :return: Path to the saved HTML report file + """ + output_path = self.output_path + if output_path.is_dir(): + output_path = output_path / self.DEFAULT_FILE + output_path.parent.mkdir(parents=True, exist_ok=True) + + data = _build_ui_data(report.benchmarks, report.args) + camel_data = recursive_key_update(deepcopy(data), camelize_str) + + ui_api_data = { + f"window.{key} = {{}};": f"window.{key} = {json.dumps(value, indent=2)};\n" + for key, value in camel_data.items() + } + + _create_html_report(ui_api_data, output_path) + + return output_path + + +class _Bucket(BaseModel): + """ + Histogram bucket for data distribution visualization. + + Represents a single bucket in a histogram with its starting value and count + of data points falling within the bucket range. Used to create distribution + histograms for metrics like token counts and request timings. + """ + + value: float | int = Field(description="Starting value of the bucket range") + count: int = Field(description="Number of data points falling within this bucket") + + @staticmethod + def from_data( + data: list[float] | list[int], + bucket_width: float | None = None, + n_buckets: int | None = None, + ) -> tuple[list[_Bucket], float]: + """ + Create histogram buckets from numeric data values. + + Divides the data range into equal-width buckets and counts values within + each bucket. Either bucket_width or n_buckets can be specified; if neither + is provided, defaults to 10 buckets. 
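For intuition, the equal-width bucketing used by this formatter behaves roughly like the simplified standalone sketch below (not the _Bucket implementation that follows, which also returns the computed bucket width):

# Simplified equal-width histogram bucketing: split [min, max] into n buckets
# and count values per bucket start, clamping the top value into the last bucket.
from collections import defaultdict


def bucket_counts(data: list[float], n_buckets: int = 10) -> dict[float, int]:
    if not data:
        return {}
    lo, hi = min(data), max(data)
    width = ((hi + 1) - lo) / n_buckets  # +1 keeps the max value in-range
    counts: dict[float, int] = defaultdict(int)
    for value in data:
        idx = min(int((value - lo) // width), n_buckets - 1)
        counts[lo + idx * width] += 1
    return dict(sorted(counts.items()))


print(bucket_counts([1, 2, 2, 3, 10, 11], n_buckets=3))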
+ + :param data: Numeric values to bucket + :param bucket_width: Width of each bucket, computed if None + :param n_buckets: Number of buckets, defaults to 10 if width not specified + :return: Tuple of bucket list and computed bucket width + """ + if not data: + return [], 1.0 + + min_v = min(data) + max_v = max(data) + range_v = (1 + max_v) - min_v + + if bucket_width is None: + if n_buckets is None: + n_buckets = 10 + bucket_width = range_v / n_buckets + else: + n_buckets = ceil(range_v / bucket_width) + + bucket_counts: defaultdict[float | int, int] = defaultdict(int) + for val in data: + idx = int((val - min_v) // bucket_width) + if idx >= n_buckets: + idx = n_buckets - 1 + bucket_start = min_v + idx * bucket_width + bucket_counts[bucket_start] += 1 + + buckets = [ + _Bucket(value=start, count=count) + for start, count in sorted(bucket_counts.items()) + ] + return buckets, bucket_width + + +class _TabularDistributionSummary(DistributionSummary): + """ + Distribution summary with tabular percentile representation. + + Extends DistributionSummary to provide percentile data formatted for table + display in the HTML report. Filters to show only key percentiles (p50, p90, + p95, p99) for concise presentation. + """ + + @computed_field + def percentile_rows(self) -> list[dict[str, str | float]]: + """ + Format percentiles as table rows for UI display. + + :return: List of dictionaries with percentile names and values + """ + rows = [ + {"percentile": name, "value": value} + for name, value in self.percentiles.model_dump().items() + ] + return list( + filter(lambda row: row["percentile"] in ["p50", "p90", "p95", "p99"], rows) + ) + + @classmethod + def from_distribution_summary( + cls, distribution: DistributionSummary + ) -> _TabularDistributionSummary: + """ + Convert standard DistributionSummary to tabular format. + + :param distribution: Source distribution summary to convert + :return: Tabular distribution summary with formatted percentile rows + """ + return cls(**distribution.model_dump()) + + +def _create_html_report(js_data: dict[str, str], output_path: Path) -> Path: + """ + Create HTML report by injecting JavaScript data into template. + + Loads the HTML template, injects JavaScript data into the head section, and + writes the final report to the specified output path. + + :param js_data: Dictionary mapping placeholder strings to JavaScript code + :param output_path: Path where HTML report will be saved + :return: Path to the saved report file + """ + html_content = load_text(settings.report_generation.source) + report_content = _inject_data(js_data, html_content) + + output_path.parent.mkdir(parents=True, exist_ok=True) + output_path.write_text(report_content) + return output_path + + +def _inject_data(js_data: dict[str, str], html: str) -> str: + """ + Inject JavaScript data into HTML head section. + + Replaces placeholder strings in the HTML head section with actual JavaScript + code containing benchmark data. Returns original HTML if no head section found. + + :param js_data: Dictionary mapping placeholder strings to JavaScript code + :param html: HTML template content + :return: HTML with injected JavaScript data + """ + head_match = re.search(r"
]*>(.*?)", html, re.DOTALL | re.IGNORECASE) + if not head_match: + logger.warning(" section missing, returning original HTML.") + return html + + head_content = head_match.group(1) + + for placeholder, script in js_data.items(): + head_content = head_content.replace(placeholder, script) + + new_head = f"{head_content}" + return html[: head_match.start()] + new_head + html[head_match.end() :] + + +def _build_ui_data( + benchmarks: list[GenerativeBenchmark], args: BenchmarkGenerativeTextArgs +) -> dict[str, Any]: + """ + Build complete UI data structure from benchmarks. + + Aggregates benchmark results into a structured format for the HTML UI, + including run metadata, workload details, and per-benchmark metrics. + + :param benchmarks: List of completed benchmark results + :param args: Benchmark configuration arguments + :return: Dictionary with run_info, workload_details, and benchmarks sections + """ + return { + "run_info": _build_run_info(benchmarks, args), + "workload_details": _build_workload_details(benchmarks, args), + "benchmarks": _build_benchmarks(benchmarks), + } + + +def _build_run_info( + benchmarks: list[GenerativeBenchmark], args: BenchmarkGenerativeTextArgs +) -> dict[str, Any]: + """ + Build run metadata from benchmarks. + + Extracts model name, timestamp, and dataset information from the benchmark + configuration and results. + + :param benchmarks: List of completed benchmark results + :param args: Benchmark configuration arguments + :return: Dictionary with model, task, timestamp, and dataset information + """ + model = args.model or "N/A" + timestamp = max(bm.start_time for bm in benchmarks if bm.start_time is not None) + return { + "model": {"name": model, "size": 0}, + "task": "N/A", + "timestamp": timestamp, + "dataset": {"name": "N/A"}, + } + + +def _build_workload_details( + benchmarks: list[GenerativeBenchmark], args: BenchmarkGenerativeTextArgs +) -> dict[str, Any]: + """ + Build workload details from benchmarks. + + Aggregates prompt and generation samples, token distribution statistics, + request timing histograms, and server configuration. Samples up to 5 random + prompts and outputs for display. 
+ + :param benchmarks: List of completed benchmark results + :param args: Benchmark configuration arguments + :return: Dictionary with prompts, generations, request timing, and server info + """ + target = args.target + rate_type = benchmarks[0].config.strategy.type_ + successful_requests = [req for bm in benchmarks for req in bm.requests.successful] + + sample_indices = random.sample( + range(len(successful_requests)), min(5, len(successful_requests)) + ) + sample_prompts = [ + req.request_args.replace("\n", " ").replace('"', "'") + if (req := successful_requests[i]).request_args + else "" + for i in sample_indices + ] + sample_outputs = [ + req.output.replace("\n", " ").replace('"', "'") + if (req := successful_requests[i]).output + else "" + for i in sample_indices + ] + + prompt_tokens = [ + float(req.prompt_tokens) if req.prompt_tokens is not None else -1 + for bm in benchmarks + for req in bm.requests.successful + ] + output_tokens = [ + float(req.output_tokens) if req.output_tokens is not None else -1 + for bm in benchmarks + for req in bm.requests.successful + ] + + prompt_token_buckets, _prompt_bucket_width = _Bucket.from_data(prompt_tokens, 1) + output_token_buckets, _output_bucket_width = _Bucket.from_data(output_tokens, 1) + + prompt_token_stats = DistributionSummary.from_values(prompt_tokens) + output_token_stats = DistributionSummary.from_values(output_tokens) + + min_start_time = benchmarks[0].start_time + all_req_times = [ + req.info.timings.request_start - min_start_time + for bm in benchmarks + for req in bm.requests.successful + if req.info.timings.request_start is not None + ] + + number_of_buckets = len(benchmarks) + request_buckets, bucket_width = _Bucket.from_data( + all_req_times, None, number_of_buckets + ) + + return { + "prompts": { + "samples": sample_prompts, + "token_distributions": { + "statistics": prompt_token_stats.model_dump() + if prompt_token_stats + else None, + "buckets": [b.model_dump() for b in prompt_token_buckets], + "bucket_width": 1, + }, + }, + "generations": { + "samples": sample_outputs, + "token_distributions": { + "statistics": output_token_stats.model_dump() + if output_token_stats + else None, + "buckets": [b.model_dump() for b in output_token_buckets], + "bucket_width": 1, + }, + }, + "requests_over_time": { + "requests_over_time": { + "buckets": [b.model_dump() for b in request_buckets], + "bucket_width": bucket_width, + }, + "num_benchmarks": number_of_buckets, + }, + "rate_type": rate_type, + "server": {"target": target}, + } + + +def _build_benchmarks(benchmarks: list[GenerativeBenchmark]) -> list[dict[str, Any]]: + """ + Build benchmark metrics data for UI display. + + Extracts key performance metrics from each benchmark including requests per + second, inter-token latency, time to first token, throughput, and request + latency. Formats distribution summaries for tabular display. 
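The "tabular display" mentioned here amounts to keeping a handful of percentiles per metric, as _TabularDistributionSummary.percentile_rows does above; a rough standalone sketch of that filtering (percentile names assumed to match DistributionSummary's fields):

# Sketch: reduce a full percentile mapping to the rows the HTML tables show.
percentiles = {"p001": 0.8, "p25": 1.1, "p50": 1.4, "p90": 2.2, "p95": 2.6, "p99": 3.9}
table_rows = [
    {"percentile": name, "value": value}
    for name, value in percentiles.items()
    if name in {"p50", "p90", "p95", "p99"}
]
print(table_rows)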
+ + :param benchmarks: List of completed benchmark results + :return: List of dictionaries with formatted benchmark metrics + """ + result = [] + for bm in benchmarks: + result.append( + { + "requests_per_second": bm.metrics.requests_per_second.successful.mean, + "itl": _TabularDistributionSummary.from_distribution_summary( + bm.metrics.inter_token_latency_ms.successful + ).model_dump(), + "ttft": _TabularDistributionSummary.from_distribution_summary( + bm.metrics.time_to_first_token_ms.successful + ).model_dump(), + "throughput": _TabularDistributionSummary.from_distribution_summary( + bm.metrics.output_tokens_per_second.successful + ).model_dump(), + "time_per_request": ( + _TabularDistributionSummary.from_distribution_summary( + bm.metrics.request_latency.successful + ).model_dump() + ), + } + ) + return result diff --git a/src/guidellm/benchmark/outputs/output.py b/src/guidellm/benchmark/outputs/output.py new file mode 100644 index 00000000..8eb021b0 --- /dev/null +++ b/src/guidellm/benchmark/outputs/output.py @@ -0,0 +1,158 @@ +""" +Base output interface for generative benchmarking results. + +This module defines the abstract base class for all benchmark output formatters in +the guidellm system. Output formatters transform benchmark reports into various file +formats (JSON, CSV, HTML, etc.) enabling flexible result persistence and analysis. +The module leverages a registry pattern for dynamic format resolution and supports +both direct instantiation and configuration-based initialization. +""" + +from __future__ import annotations + +from abc import ABC, abstractmethod +from collections.abc import Mapping, Sequence +from pathlib import Path +from typing import Any + +from pydantic import BaseModel, ConfigDict + +from guidellm.benchmark.schemas import GenerativeBenchmarksReport +from guidellm.utils import RegistryMixin + +__all__ = ["GenerativeBenchmarkerOutput"] + + +class GenerativeBenchmarkerOutput( + BaseModel, RegistryMixin[type["GenerativeBenchmarkerOutput"]], ABC +): + """ + Abstract base for benchmark output formatters with registry support. + + Defines the interface for transforming benchmark reports into various output + formats. Subclasses implement specific formatters (JSON, CSV, HTML) that can be + registered and resolved dynamically. Supports flexible initialization from string + identifiers, file paths, or configuration dictionaries enabling declarative + output configuration in benchmark runs. + + Example: + :: + # Register and resolve output formats + outputs = GenerativeBenchmarkerOutput.resolve( + output_formats=["json", "csv"], + output_path="./results" + ) + + # Finalize outputs with benchmark report + for output in outputs.values(): + await output.finalize(report) + """ + + model_config = ConfigDict( + extra="ignore", + arbitrary_types_allowed=True, + validate_assignment=True, + from_attributes=True, + use_enum_values=True, + ) + + @classmethod + @abstractmethod + def validated_kwargs(cls, *args, **kwargs) -> dict[str, Any]: + """ + Validate and normalize initialization arguments for output formatter. + + Processes positional and keyword arguments into a validated parameter + dictionary suitable for formatter instantiation. Subclasses implement + format-specific validation logic handling their unique parameter patterns. 
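To make the contract concrete, here is a hypothetical plain-text formatter; the "txt" name, default filename, and summary line are invented for illustration, and only the register/validated_kwargs/finalize shape comes from the code in this diff:

# Hypothetical subclass illustrating the formatter contract; not part of guidellm.
from __future__ import annotations

from pathlib import Path
from typing import Any

from pydantic import Field

from guidellm.benchmark.outputs.output import GenerativeBenchmarkerOutput
from guidellm.benchmark.schemas import GenerativeBenchmarksReport


@GenerativeBenchmarkerOutput.register("txt")
class GenerativeBenchmarkerText(GenerativeBenchmarkerOutput):
    output_path: Path = Field(default_factory=lambda: Path.cwd())

    @classmethod
    def validated_kwargs(
        cls, output_path: str | Path | None = None, **_kwargs
    ) -> dict[str, Any]:
        return {} if output_path is None else {"output_path": Path(output_path)}

    async def finalize(self, report: GenerativeBenchmarksReport) -> Path:
        path = self.output_path
        if path.is_dir():
            path = path / "benchmarks.txt"
        path.write_text(f"completed benchmarks: {len(report.benchmarks)}\n")
        return path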
+ + :param args: Positional arguments for formatter configuration + :param kwargs: Keyword arguments for formatter configuration + :return: Validated dictionary of parameters for formatter creation + :raises NotImplementedError: Must be implemented by subclasses + """ + ... + + @classmethod + def resolve( + cls, + output_formats: ( + Sequence[str] + | Mapping[str, Any | dict[str, Any] | GenerativeBenchmarkerOutput] + | None + ), + output_path: str | Path | None, + ) -> dict[str, GenerativeBenchmarkerOutput]: + """ + Resolve output format specifications into formatter instances. + + Supports multiple input patterns: format identifiers (["json", "csv"]), + file paths (["results.json"]), format configurations ({"json": {"indent": 2}}), + or pre-instantiated formatters. Registered format types are resolved from the + registry and instantiated with validated parameters. + + :param output_formats: Format specifications as sequence of identifiers/paths, + mapping of format configurations, or None for no outputs + :param output_path: Default output directory path for all formatters + :return: Dictionary mapping format keys to instantiated formatter instances + :raises TypeError: If format specification type is invalid + :raises ValueError: If format resolution or validation fails + """ + resolved: dict[str, GenerativeBenchmarkerOutput] = {} + + if not output_formats: + return resolved + + if isinstance(output_formats, list | tuple): + # convert to dict for uniform processing + formats_list = output_formats + output_formats = {} + for output_format in formats_list: + # Check for registered type, if not, then assume it's a file path + if cls.is_registered(output_format): + output_formats[output_format] = {} + else: + path = Path(output_format) + format_type = path.suffix[1:].lower() + output_formats[format_type] = {"output_path": path} + + for key, val in output_formats.items(): # type: ignore[union-attr] + if isinstance(val, GenerativeBenchmarkerOutput): + resolved[key] = val + else: + output_class = cls.get_registered_object(key) + if output_class is None: + available_formats = ( + list(cls.registry.keys()) if cls.registry else [] + ) + raise ValueError( + f"Output format '{key}' is not registered. " + f"Available formats: {available_formats}" + ) + + kwargs: dict[str, Any] = {"output_path": output_path} + + if isinstance(val, dict): + kwargs.update(val) + kwargs = output_class.validated_kwargs(**kwargs) + else: + kwargs = output_class.validated_kwargs(val, **kwargs) + + resolved[key] = output_class(**kwargs) + + return resolved + + @abstractmethod + async def finalize(self, report: GenerativeBenchmarksReport) -> Any: + """ + Process and persist benchmark report in the formatter's output format. + + Transforms the provided benchmark report into the target format and writes + results to the configured output destination. Implementation details vary by + formatter type (file writing, API calls, etc.). + + :param report: Benchmark report containing results to format and output + :return: Format-specific output result (file path, response object, etc.) + :raises NotImplementedError: Must be implemented by subclasses + """ + ... diff --git a/src/guidellm/benchmark/outputs/serialized.py b/src/guidellm/benchmark/outputs/serialized.py new file mode 100644 index 00000000..52dc632a --- /dev/null +++ b/src/guidellm/benchmark/outputs/serialized.py @@ -0,0 +1,69 @@ +""" +Serialized output handler for generative benchmark reports. 
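Based on the resolution logic above, mixing a registered format name with a bare file path behaves roughly as follows (a usage sketch; it assumes the csv and json formatter modules have been imported so their registrations are in place):

# Usage sketch for GenerativeBenchmarkerOutput.resolve.
from guidellm.benchmark.outputs.output import GenerativeBenchmarkerOutput

outputs = GenerativeBenchmarkerOutput.resolve(
    output_formats=["csv", "results.json"],
    output_path="./benchmark-results",
)
# "csv" resolves to the registered CSV formatter using the default directory;
# "results.json" resolves to the "json" formatter with output_path overridden.
for name, formatter in outputs.items():
    print(name, type(formatter).__name__)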
+ +This module provides a serialized output implementation that saves benchmark reports +to JSON or YAML file formats. It extends the base GenerativeBenchmarkerOutput to +handle file-based persistence of benchmark results, supporting both directory and +explicit file path specifications for report serialization. +""" + +from __future__ import annotations + +from pathlib import Path +from typing import Any + +from pydantic import Field + +from guidellm.benchmark.outputs.output import GenerativeBenchmarkerOutput +from guidellm.benchmark.schemas import GenerativeBenchmarksReport + +__all__ = ["GenerativeBenchmarkerSerialized"] + + +@GenerativeBenchmarkerOutput.register(["json", "yaml"]) +class GenerativeBenchmarkerSerialized(GenerativeBenchmarkerOutput): + """ + Serialized output handler for benchmark reports in JSON or YAML formats. + + This output handler persists generative benchmark reports to the file system in + either JSON or YAML format. It supports flexible path specification, allowing + users to provide either a directory (where a default filename will be generated) + or an explicit file path for the serialized report output. + + Example: + :: + output = GenerativeBenchmarkerSerialized(output_path="/path/to/output.json") + result_path = await output.finalize(report) + """ + + output_path: Path = Field( + default_factory=lambda: Path.cwd(), + description="Directory or file path for saving the serialized report", + ) + + @classmethod + def validated_kwargs( + cls, output_path: str | Path | None, **_kwargs + ) -> dict[str, Any]: + """ + Validate and normalize output path keyword arguments. + + :param output_path: Directory or file path for serialization output + :param _kwargs: Additional keyword arguments (ignored) + :return: Dictionary of validated keyword arguments for class initialization + """ + validated: dict[str, Any] = {} + if output_path is not None: + validated["output_path"] = ( + Path(output_path) if not isinstance(output_path, Path) else output_path + ) + return validated + + async def finalize(self, report: GenerativeBenchmarksReport) -> Path: + """ + Serialize and save the benchmark report to the configured output path. 
+ + :param report: The generative benchmarks report to serialize + :return: Path to the saved report file + """ + return report.save_file(self.output_path) diff --git a/src/guidellm/benchmark/profile.py b/src/guidellm/benchmark/profile.py index 4b3f36fd..dc0a30d0 100644 --- a/src/guidellm/benchmark/profile.py +++ b/src/guidellm/benchmark/profile.py @@ -33,11 +33,10 @@ ConstraintInitializer, ConstraintsInitializerFactory, SchedulingStrategy, - StrategyType, SynchronousStrategy, ThroughputStrategy, ) -from guidellm.utils import PydanticClassRegistryMixin +from guidellm.schemas import PydanticClassRegistryMixin if TYPE_CHECKING: from guidellm.benchmark.schemas import Benchmark @@ -56,7 +55,7 @@ class Profile( - PydanticClassRegistryMixin["type[Profile]"], + PydanticClassRegistryMixin["Profile"], ABC, ): """ @@ -74,6 +73,7 @@ class Profile( @classmethod def __pydantic_schema_base_type__(cls) -> type[Profile]: + """Return the base type for polymorphic validation hierarchy.""" if cls.__name__ == "Profile": return cls @@ -97,7 +97,10 @@ def create( :return: Configured profile instance for the specified type :raises ValueError: If rate_type is not registered """ - profile_class: type[Profile] = cls.get_registered_object(rate_type) + profile_class = cls.get_registered_object(rate_type) + if profile_class is None: + raise ValueError(f"Profile type '{rate_type}' is not registered") + resolved_kwargs = profile_class.resolve_args( rate_type=rate_type, rate=rate, random_seed=random_seed, **kwargs ) @@ -138,7 +141,7 @@ def resolve_args( @computed_field # type: ignore[misc] @property - def strategy_types(self) -> list[StrategyType]: + def strategy_types(self) -> list[str]: """ :return: Strategy types executed or expected to execute in this profile """ @@ -147,10 +150,7 @@ def strategy_types(self) -> list[StrategyType]: def strategies_generator( self, ) -> Generator[ - tuple[ - SchedulingStrategy | None, - dict[str, Any | dict[str, Any] | Constraint] | None, - ], + tuple[SchedulingStrategy, dict[str, Constraint] | None], Benchmark | None, None, ]: @@ -196,7 +196,7 @@ def next_strategy_constraints( next_strategy: SchedulingStrategy | None, prev_strategy: SchedulingStrategy | None, prev_benchmark: Benchmark | None, - ) -> dict[str, Any | dict[str, Any] | Constraint] | None: + ) -> dict[str, Constraint] | None: """ Generate constraints for the next strategy execution. 
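The stricter lookup added to Profile.create above turns an unknown rate type into a clear error instead of a later failure on a None class; a usage sketch (argument names taken from the snippet above, concrete registered types depend on the profiles defined in this module):

# Usage sketch for the stricter Profile.create registry lookup.
from guidellm.benchmark.profile import Profile

try:
    Profile.create("not-a-registered-type", rate=None, random_seed=42)
except ValueError as err:
    print(err)  # Profile type 'not-a-registered-type' is not registered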
@@ -225,14 +225,16 @@ def _constraints_validator( return { key: ( - val - if not isinstance(val, ConstraintInitializer) - else ConstraintsInitializerFactory.deserialize(initializer_dict=val) + ConstraintsInitializerFactory.deserialize(initializer_dict=val) + if isinstance(val, dict) + and "type_" in val + and not isinstance(val, ConstraintInitializer) + else val ) for key, val in value.items() } - @field_serializer + @field_serializer("constraints") def _constraints_serializer( self, constraints: dict[str, Any | dict[str, Any] | ConstraintInitializer] | None, @@ -281,7 +283,7 @@ def resolve_args( return kwargs @property - def strategy_types(self) -> list[StrategyType]: + def strategy_types(self) -> list[str]: """ :return: Single synchronous strategy type """ @@ -346,7 +348,7 @@ def resolve_args( return kwargs @property - def strategy_types(self) -> list[StrategyType]: + def strategy_types(self) -> list[str]: """ :return: Concurrent strategy types for each configured stream count """ @@ -419,7 +421,7 @@ def resolve_args( return kwargs @property - def strategy_types(self) -> list[StrategyType]: + def strategy_types(self) -> list[str]: """ :return: Single throughput strategy type """ @@ -510,7 +512,7 @@ def resolve_args( return kwargs @property - def strategy_types(self) -> list[StrategyType]: + def strategy_types(self) -> list[str]: """ :return: Async strategy types for each configured rate """ @@ -622,7 +624,7 @@ def resolve_args( return kwargs @property - def strategy_types(self) -> list[StrategyType]: + def strategy_types(self) -> list[str]: """ :return: Strategy types for the complete sweep sequence """ @@ -637,8 +639,8 @@ def next_strategy( ) -> ( AsyncConstantStrategy | AsyncPoissonStrategy - | SynchronousProfile - | ThroughputProfile + | SynchronousStrategy + | ThroughputStrategy | None ): """ @@ -656,9 +658,7 @@ def next_strategy( return SynchronousStrategy() if prev_strategy.type_ == "synchronous": - self.synchronous_rate = prev_benchmark.get_request_metrics_sample()[ - "request_throughput" - ] + self.synchronous_rate = prev_benchmark.request_throughput.successful.mean return ThroughputStrategy( max_concurrency=self.max_concurrency, @@ -666,9 +666,7 @@ def next_strategy( ) if prev_strategy.type_ == "throughput": - self.throughput_rate = prev_benchmark.get_request_metrics_sample()[ - "request_throughput" - ] + self.throughput_rate = prev_benchmark.request_throughput.successful.mean if self.synchronous_rate <= 0 and self.throughput_rate <= 0: raise RuntimeError( "Invalid rates in sweep; aborting. " diff --git a/src/guidellm/benchmark/progress.py b/src/guidellm/benchmark/progress.py index 558def67..2e6c3274 100644 --- a/src/guidellm/benchmark/progress.py +++ b/src/guidellm/benchmark/progress.py @@ -1,17 +1,11 @@ """ -Benchmark progress tracking and console display abstractions. +Progress tracking and console display for benchmark execution monitoring. -Provides progress tracking interfaces and implementations for monitoring benchmark -execution, displaying real-time statistics, and managing UI updates during -generative benchmarking operations. - -Classes: - BenchmarkerProgress: Abstract base for benchmark progress tracking. - BenchmarkerProgressGroup: Composite progress handler for multiple instances. - GenerativeConsoleBenchmarkerProgress: Console-based progress display. - -Type Variables: - BenchmarkT: Generic benchmark object type. +Provides abstract interfaces and concrete implementations for tracking benchmark +progress during execution. 
The module enables real-time display of benchmark +statistics, metrics, and execution state through console-based UI components. +Primary use cases include monitoring generative benchmark runs with detailed +request/token statistics and scheduler state updates. """ from __future__ import annotations @@ -37,93 +31,92 @@ from guidellm.benchmark.profile import Profile from guidellm.benchmark.schemas import ( + BenchmarkAccumulatorT, BenchmarkT, - EstimatedBenchmarkState, GenerativeBenchmark, + GenerativeBenchmarkAccumulator, ) -from guidellm.scheduler import SchedulerState, SchedulingStrategy, StrategyType +from guidellm.scheduler import SchedulerState, SchedulingStrategy from guidellm.utils import Colors, format_value_display __all__ = ["BenchmarkerProgress", "GenerativeConsoleBenchmarkerProgress"] -class BenchmarkerProgress(Generic[BenchmarkT], ABC): +class BenchmarkerProgress(Generic[BenchmarkAccumulatorT, BenchmarkT], ABC): """ - Abstract base class for tracking and displaying benchmark progress. + Abstract interface for tracking and displaying benchmark execution progress. - Provides lifecycle hooks for monitoring benchmark execution stages including - initialization, start, updates, completion, and finalization. Supports - enable/disable functionality for conditional progress tracking. + Provides lifecycle hooks for monitoring benchmark stages including initialization, + execution start, progress updates, completion, and finalization. Implementations + handle display updates, progress tracking, and resource management for benchmark + monitoring. """ def __init__(self): - """ - Initialize progress tracker. - - :param enabled: Whether to enable progress tracking and display. - """ - self.profile: Profile = None - self.current_strategy: SchedulingStrategy = None + """Initialize progress tracker with default state.""" + self.profile: Profile | None = None + self.current_strategy: SchedulingStrategy | None = None @abstractmethod async def on_initialize(self, profile: Profile): """ - Initialize progress tracking for benchmark profile. + Initialize progress tracking for the given benchmark profile. - :param profile: Benchmark profile configuration. + :param profile: Benchmark profile configuration defining execution parameters """ @abstractmethod async def on_benchmark_start(self, strategy: SchedulingStrategy): """ - Handle start of new benchmark strategy execution. + Handle benchmark strategy execution start event. - :param strategy: Scheduling strategy being executed. + :param strategy: Scheduling strategy configuration being executed """ @abstractmethod async def on_benchmark_update( - self, estimated_state: EstimatedBenchmarkState, scheduler_state: SchedulerState + self, accumulator: BenchmarkAccumulatorT, scheduler_state: SchedulerState ): """ - Handle benchmark execution progress update. + Handle benchmark execution progress update with current metrics. - :param estimated_state: Current benchmark metrics and statistics. - :param scheduler_state: Current scheduler execution state. + :param accumulator: Current accumulated benchmark metrics and statistics + :param scheduler_state: Current scheduler execution state and counters """ @abstractmethod async def on_benchmark_complete(self, benchmark: BenchmarkT): """ - Handle completion of benchmark strategy execution. + Handle benchmark strategy execution completion event. - :param benchmark: Completed benchmark results. 
+ :param benchmark: Completed benchmark results with final metrics """ @abstractmethod async def on_finalize(self): - """Finalize progress tracking and cleanup resources.""" + """Finalize progress tracking and release associated resources.""" class GenerativeConsoleBenchmarkerProgress( - BenchmarkerProgress[GenerativeBenchmark], Live + BenchmarkerProgress[GenerativeBenchmarkAccumulator, GenerativeBenchmark], Live ): """ - Console-based progress display for generative benchmarks. + Console-based real-time progress display for generative benchmarks. - Provides real-time visual progress tracking using Rich library components, - displaying benchmark execution statistics, timing information, and progress - bars in a structured console interface. + Renders live benchmark execution statistics using Rich library components with + structured progress bars, timing information, request/token metrics, and optional + scheduler statistics. Updates refresh automatically during benchmark execution. + + :cvar display_scheduler_stats: Whether to include scheduler statistics in display """ def __init__(self, display_scheduler_stats: bool = False): """ - Initialize console progress display. + Initialize console progress display with rendering configuration. - :param enabled: Whether to enable progress tracking and display. - :param display_scheduler_stats: Whether to display scheduler statistics. + :param display_scheduler_stats: Whether to display scheduler timing statistics """ - BenchmarkerProgress.__init__(self) + super().__init__() Live.__init__( self, refresh_per_second=4, @@ -132,15 +125,15 @@ def __init__(self, display_scheduler_stats: bool = False): redirect_stderr=True, ) self.display_scheduler_stats: bool = display_scheduler_stats - self.run_progress: Progress = None - self.run_progress_task: TaskID = None - self.tasks_progress: _GenerativeProgressTasks = None + self.run_progress: Progress | None = None + self.run_progress_task: TaskID | None = None + self.tasks_progress: _GenerativeProgressTasks | None = None async def on_initialize(self, profile: Profile): """ - Initialize console display components and start rendering. + Initialize console display components and begin live rendering. - :param profile: Benchmark profile configuration. + :param profile: Benchmark profile configuration defining execution parameters """ self.tasks_progress = _GenerativeProgressTasks( profile=profile, display_scheduler_stats=self.display_scheduler_stats @@ -179,41 +172,46 @@ async def on_initialize(self, profile: Profile): async def on_benchmark_start(self, strategy: SchedulingStrategy): """ - Update display for new benchmark strategy start. + Update display for benchmark strategy execution start. - :param strategy: Scheduling strategy being executed. + :param strategy: Scheduling strategy configuration being executed """ - self.tasks_progress.start_benchmark(strategy) - self._sync_run_progress() + if self.tasks_progress is not None: + self.tasks_progress.start_benchmark(strategy) + self._sync_run_progress() async def on_benchmark_update( self, - aggregator_update: EstimatedBenchmarkState | None, + accumulator: GenerativeBenchmarkAccumulator, scheduler_state: SchedulerState, ): """ - Update display with current benchmark progress. + Update display with current benchmark progress and metrics. - :param aggregator_update: Current benchmark metrics and statistics. - :param scheduler_state: Current scheduler execution state. 
+ :param accumulator: Current accumulated benchmark metrics and statistics + :param scheduler_state: Current scheduler execution state and counters """ - self.tasks_progress.update_benchmark(aggregator_update, scheduler_state) - self._sync_run_progress() + if self.tasks_progress is not None: + self.tasks_progress.update_benchmark(accumulator, scheduler_state) + self._sync_run_progress() async def on_benchmark_complete(self, benchmark: GenerativeBenchmark): """ - Update display for completed benchmark. + Update display for completed benchmark strategy. - :param benchmark: Completed benchmark results. + :param benchmark: Completed benchmark results with final metrics """ - self.tasks_progress.complete_benchmark(benchmark) - self._sync_run_progress() + if self.tasks_progress is not None: + self.tasks_progress.complete_benchmark(benchmark) + self._sync_run_progress() async def on_finalize(self): - """Stop display rendering and cleanup resources.""" - self.tasks_progress.finalize() - self._sync_run_progress() - self.run_progress.stop_task(self.run_progress_task) + """Stop display rendering and release resources.""" + if self.tasks_progress is not None: + self.tasks_progress.finalize() + self._sync_run_progress() + if self.run_progress is not None and self.run_progress_task is not None: + self.run_progress.stop_task(self.run_progress_task) self.stop() self.run_progress = None self.run_progress_task = None @@ -221,13 +219,18 @@ async def on_finalize(self): def _sync_run_progress(self): """Synchronize overall progress display with task progress.""" - self.run_progress.update( - self.run_progress_task, - total=self.tasks_progress.steps_total, - completed=self.tasks_progress.steps_progress, - completed_benchmarks=self.tasks_progress.tasks_progress, - total_benchmarks=self.tasks_progress.tasks_total, - ) + if ( + self.run_progress is not None + and self.run_progress_task is not None + and self.tasks_progress is not None + ): + self.run_progress.update( + self.run_progress_task, + total=self.tasks_progress.steps_total, + completed=self.tasks_progress.steps_progress, + completed_benchmarks=self.tasks_progress.tasks_progress, + total_benchmarks=self.tasks_progress.tasks_total, + ) # Scaling factor for progress calculations to provide granular progress updates @@ -283,7 +286,7 @@ def steps_progress(self) -> int: ) progress_total = self.current_index + (progress_current_task or 0) - return progress_total * _PROGRESS_SCALE + return int(progress_total * _PROGRESS_SCALE) def start_benchmark(self, strategy: SchedulingStrategy): self.current_index += 1 @@ -294,32 +297,36 @@ def start_benchmark(self, strategy: SchedulingStrategy): task_state.task_id = task_id self.benchmark_task_states.append(task_state) - self.benchmark_task_states[self.current_index].start(strategy) - self.update( - self.benchmark_task_states[self.current_index].task_id, - start=True, - **self.benchmark_task_states[self.current_index].current, - ) + current_state = self.benchmark_task_states[self.current_index] + current_state.start(strategy) + if current_state.task_id is not None: + self.update( + current_state.task_id, + start=True, + **current_state.current, + ) def update_benchmark( self, - aggregator_update: EstimatedBenchmarkState, + accumulator: GenerativeBenchmarkAccumulator, scheduler_state: SchedulerState, ): - self.benchmark_task_states[self.current_index].update( - aggregator_update, scheduler_state - ) - self.update( - self.benchmark_task_states[self.current_index].task_id, - 
**self.benchmark_task_states[self.current_index].current, - ) + current_state = self.benchmark_task_states[self.current_index] + current_state.update(accumulator, scheduler_state) + if current_state.task_id is not None: + self.update( + current_state.task_id, + **current_state.current, + ) def complete_benchmark(self, benchmark: GenerativeBenchmark): - self.benchmark_task_states[self.current_index].complete(benchmark) - self.update( - self.benchmark_task_states[self.current_index].task_id, - **self.benchmark_task_states[self.current_index].current, - ) + current_state = self.benchmark_task_states[self.current_index] + current_state.complete(benchmark) + if current_state.task_id is not None: + self.update( + current_state.task_id, + **current_state.current, + ) def finalize(self): self.stop() @@ -327,29 +334,29 @@ def finalize(self): @dataclass class _GenerativeProgressTaskState: - strategy_type: StrategyType - task_id: TaskID = None + strategy_type: str + task_id: TaskID | None = None strategy: SchedulingStrategy | None = None benchmark_status: Literal[ - "pending", "in_warmup", "in_progress", "in_cooldown", "completed" + "pending", "warmup", "active", "cooldown", "completed" ] = "pending" progress: float | None = None start_time: float = -1.0 successful_requests: int = 0 cancelled_requests: int = 0 errored_requests: int = 0 - request_concurrency: int = 0 - requests_per_second: float = 0 - request_latency: float = 0 - output_tokens: int = 0 - output_tokens_rate: float = 0 - prompt_tokens: int = 0 - total_tokens_rate: float = 0 - time_to_first_token: float = 0 - inter_token_latency: float = 0 - queued_time: float = 0 - request_targeted_start_delay: float = 0 - scheduler_overheads_time: float = 0 + request_concurrency: float = 0.0 + requests_per_second: float = 0.0 + request_latency: float = 0.0 + output_tokens: float = 0 + output_tokens_rate: float = 0.0 + prompt_tokens: float = 0 + total_tokens_rate: float = 0.0 + time_to_first_token: float = 0.0 + inter_token_latency: float = 0.0 + queued_time: float = 0.0 + request_targeted_start_delay: float = 0.0 + scheduler_overheads_time: float = 0.0 @property def current(self) -> dict[str, Any]: @@ -367,12 +374,12 @@ def current(self) -> dict[str, Any]: @property def completed(self) -> float: if self.benchmark_status == "pending": - return 0 + return 0.0 if self.benchmark_status == "completed": - return _PROGRESS_SCALE + return float(_PROGRESS_SCALE) - return self.progress * _PROGRESS_SCALE if self.progress is not None else None + return self.progress * _PROGRESS_SCALE if self.progress is not None else 0.0 @property def total(self) -> float: @@ -387,13 +394,13 @@ def formatted_start_time(self) -> str: @property def formatted_progress_status(self) -> str: - if self.benchmark_status == "in_warmup": + if self.benchmark_status == "warmup": status = "warmup" color = Colors.progress - elif self.benchmark_status == "in_progress": + elif self.benchmark_status == "active": status = "running" color = Colors.progress - elif self.benchmark_status == "in_cooldown": + elif self.benchmark_status == "cooldown": status = "cooldown" color = Colors.progress elif self.benchmark_status == "completed": @@ -560,7 +567,7 @@ def start(self, strategy: SchedulingStrategy): def update( self, - estimated_state: EstimatedBenchmarkState, + accumulator: GenerativeBenchmarkAccumulator, scheduler_state: SchedulerState, ): self.progress = ( @@ -569,76 +576,40 @@ def update( else 0.0 ) self._update_processing_states( - benchmark_status=estimated_state.get_metric( - 
group=EstimatedBenchmarkState.benchmark_state_group, - key="status", - default=None, - ), - start_time=scheduler_state.start_time, + benchmark_status=self._map_status(accumulator.timings.status), + start_time=accumulator.timings.measure_start, successful_requests=scheduler_state.successful_requests, cancelled_requests=scheduler_state.cancelled_requests, errored_requests=scheduler_state.errored_requests, ) self._update_request_stats( - request_concurrency=estimated_state.get_metric( - group=EstimatedBenchmarkState.benchmark_metrics_group, - key="concurrency_requests", - ), - requests_per_second=estimated_state.get_metric( - group=EstimatedBenchmarkState.benchmark_metrics_group, - key="completed_requests_per_second", - ), - request_latency=estimated_state.get_metric( - group=EstimatedBenchmarkState.benchmark_metrics_group, - key="completed_request_latency", - ), + request_concurrency=accumulator.concurrency_metric.time_weighted_mean, + requests_per_second=accumulator.completed_metrics.requests.rate_per_second, + request_latency=accumulator.completed_metrics.request_latency.mean, ) self._update_token_stats( - output_tokens=estimated_state.get_metric( - group=EstimatedBenchmarkState.benchmark_metrics_group, - key="completed_output_tokens_total", - ), - output_tokens_rate=estimated_state.get_metric( - group=EstimatedBenchmarkState.benchmark_metrics_group, - key="completed_output_tokens", - ), - prompt_tokens=estimated_state.get_metric( - group=EstimatedBenchmarkState.benchmark_metrics_group, - key="completed_input_tokens_total", - ), - total_tokens_rate=estimated_state.get_metric( - group=EstimatedBenchmarkState.benchmark_metrics_group, - key="completed_total_tokens", - ), - time_to_first_token=estimated_state.get_metric( - group=EstimatedBenchmarkState.benchmark_metrics_group, - key="completed_time_to_first_token", - ), - inter_token_latency=estimated_state.get_metric( - group=EstimatedBenchmarkState.benchmark_metrics_group, - key="completed_inter_token_latency", - ), + output_tokens=accumulator.completed_metrics.total_tokens.mean, + output_tokens_rate=accumulator.completed_metrics.output_tokens.rate_per_second, + prompt_tokens=accumulator.completed_metrics.input_tokens.mean, + total_tokens_rate=accumulator.completed_metrics.total_tokens.rate_per_second, + time_to_first_token=accumulator.completed_metrics.time_to_first_token_ms.mean, + inter_token_latency=accumulator.completed_metrics.inter_token_latency_ms.mean, + converted=True, + ) + self._update_system_stats( + request_targeted_start_delay=accumulator.scheduler_metrics.request_targeted_start_delay.mean, + queued_time=accumulator.scheduler_metrics.queued_time.mean, + scheduler_overheads_time=accumulator.scheduler_metrics.resolve_end_delay.mean, + converted=False, ) - if estimated_state.get("updated_scheduler_stats"): - self._update_system_stats( - request_targeted_start_delay=estimated_state.get_metric( - group=EstimatedBenchmarkState.scheduler_state_group, - key="request_targeted_start_delay", - ), - queued_time=estimated_state.get_metric( - group=EstimatedBenchmarkState.scheduler_state_group, - key="queued_time", - ), - scheduler_overheads_time=0.0, # Need to add up metrics here - ) def complete(self, benchmark: GenerativeBenchmark): self._update_processing_states( benchmark_status="completed", start_time=benchmark.start_time, - successful_requests=benchmark.request_totals.successful, - cancelled_requests=benchmark.request_totals.incomplete, - errored_requests=benchmark.request_totals.errored, + 
successful_requests=benchmark.metrics.request_totals.successful, + cancelled_requests=benchmark.metrics.request_totals.incomplete, + errored_requests=benchmark.metrics.request_totals.errored, ) self._update_request_stats( request_concurrency=benchmark.metrics.request_concurrency.successful.mean, @@ -659,11 +630,19 @@ def complete(self, benchmark: GenerativeBenchmark): converted=True, ) + @staticmethod + def _map_status( + status: Literal["pending", "warmup", "active", "cooldown"], + ) -> Literal["pending", "warmup", "active", "cooldown", "completed"]: + """Map accumulator status to internal progress status representation.""" + return status + def _update_processing_states( self, benchmark_status: Literal[ - "pending", "in_warmup", "in_progress", "in_cooldown", "completed" - ], + "pending", "warmup", "active", "cooldown", "completed" + ] + | None = None, start_time: float | None = None, successful_requests: int | None = None, cancelled_requests: int | None = None, @@ -682,7 +661,7 @@ def _update_processing_states( def _update_request_stats( self, - request_concurrency: int | None = None, + request_concurrency: float | None = None, requests_per_second: float | None = None, request_latency: float | None = None, ): @@ -695,9 +674,9 @@ def _update_request_stats( def _update_token_stats( self, - output_tokens: int | None = None, + output_tokens: float | None = None, output_tokens_rate: float | None = None, - prompt_tokens: int | None = None, + prompt_tokens: float | None = None, total_tokens_rate: float | None = None, time_to_first_token: float | None = None, inter_token_latency: float | None = None, diff --git a/src/guidellm/benchmark/scenarios/chat.json b/src/guidellm/benchmark/scenarios/chat.json index 58fd18e2..a4137147 100644 --- a/src/guidellm/benchmark/scenarios/chat.json +++ b/src/guidellm/benchmark/scenarios/chat.json @@ -3,4 +3,4 @@ "data": [ "prompt_tokens=512,prompt_tokens_stdev=128,prompt_tokens_min=1,prompt_tokens_max=1024,output_tokens=256,output_tokens_stdev=64,output_tokens_min=1,output_tokens_max=1024" ] -} \ No newline at end of file +} diff --git a/src/guidellm/benchmark/scenarios/rag.json b/src/guidellm/benchmark/scenarios/rag.json index ea38d76e..0a82e9e9 100644 --- a/src/guidellm/benchmark/scenarios/rag.json +++ b/src/guidellm/benchmark/scenarios/rag.json @@ -3,4 +3,4 @@ "data": [ "prompt_tokens=4096,prompt_tokens_stdev=512,prompt_tokens_min=2048,prompt_tokens_max=6144,output_tokens=512,output_tokens_stdev=128,output_tokens_min=1,output_tokens_max=1024" ] -} \ No newline at end of file +} diff --git a/src/guidellm/benchmark/schemas.py b/src/guidellm/benchmark/schemas.py deleted file mode 100644 index b2fd15f5..00000000 --- a/src/guidellm/benchmark/schemas.py +++ /dev/null @@ -1,2128 +0,0 @@ -""" -Benchmark data models and metrics for generative AI performance measurement. - -Provides comprehensive data structures for capturing, storing, and analyzing -benchmark results from scheduler-driven generative AI workload executions. -Core abstractions include base benchmark interfaces, generative-specific -metrics with token/latency distributions, request-level statistics tracking, -and multi-benchmark reporting capabilities. These models enable detailed -performance analysis including throughput, latency, concurrency patterns, and -domain-specific metrics for text, image, video, and audio generation tasks. 
-""" - -from __future__ import annotations - -import inspect -import json -import random -import time -import uuid -from abc import ABC, abstractmethod -from collections.abc import Callable, Iterable -from pathlib import Path -from typing import Any, ClassVar, Literal, TypeVar, cast - -import yaml -from pydantic import ( - AliasChoices, - AliasGenerator, - ConfigDict, - Field, - ValidationError, - ValidatorFunctionWrapHandler, - computed_field, - field_validator, - model_serializer, -) -from torch.utils.data import Sampler -from transformers import PreTrainedTokenizerBase - -from guidellm.backends import Backend, BackendType -from guidellm.benchmark.profile import Profile, ProfileType -from guidellm.benchmark.scenarios import get_builtin_scenarios -from guidellm.data import DatasetPreprocessor -from guidellm.scheduler import ( - BackendInterface, - Environment, - SchedulerState, - SchedulingStrategy, - StrategyType, -) -from guidellm.schemas import ( - GenerationRequest, - GenerationResponse, - GenerativeRequestStats, - RequestInfo, - UsageMetrics, -) -from guidellm.utils import ( - InfoMixin, - StandardBaseDict, - StandardBaseModel, - StatusBreakdown, - StatusDistributionSummary, -) - -__all__ = [ - "Benchmark", - "BenchmarkGenerativeTextArgs", - "BenchmarkSchedulerStats", - "BenchmarkT", - "BenchmarkerArgs", - "BenchmarkerDict", - "EstimatedBenchmarkState", - "GenerativeAudioMetricsSummary", - "GenerativeBenchmark", - "GenerativeBenchmarksReport", - "GenerativeImageMetricsSummary", - "GenerativeMetrics", - "GenerativeMetricsSummary", - "GenerativeTextMetricsSummary", - "GenerativeVideoMetricsSummary", - "SchedulerDict", -] - - -class EstimatedBenchmarkState(dict[str, Any]): - """ - Accumulator for real-time benchmark metrics during scheduler execution. - - Tracks incremental metrics, running averages, and time-based statistics as - requests are processed. Maintains grouped metrics for benchmark state, - benchmark-level metrics, and scheduler-level metrics with support for - average, rate, and time-averaged metric calculations. - - :cvar benchmark_state_group: Metric group key for benchmark state tracking - :cvar benchmark_metrics_group: Metric group key for benchmark-level metrics - :cvar scheduler_state_group: Metric group key for scheduler-level metrics - """ - - benchmark_state_group: ClassVar[Literal["benchmark_state"]] = "benchmark_state" - benchmark_metrics_group: ClassVar[Literal["benchmark_metrics"]] = ( - "benchmark_metrics" - ) - scheduler_state_group: ClassVar[Literal["scheduler_state"]] = "scheduler_state" - - def get_metric( - self, - group: str, - key: str, - default: int | float | None = None, - ) -> int | float | None: - """ - Retrieve a grouped metric value by group and key. - - :param group: Metric group identifier - :param key: Metric key within the group - :param default: Value returned if metric doesn't exist - :return: The metric value or default if not found - """ - return self.get(f"{group}_{key}", default) - - def set_metric( - self, - group: str, - key: str, - value: bool | int | float | None, - start_val: bool | int | float | None = None, - ) -> bool | int | float | None: - """ - Set a grouped metric value, optionally adjusting by a starting value. 
- - :param group: Metric group identifier - :param key: Metric key within the group - :param value: Metric value to set - :param start_val: Optional starting value to subtract from the metric value - :return: The adjusted metric value or None if value is None - """ - if value is None: - return None - - if start_val is not None: - value -= start_val - self[f"{group}_{key}"] = value - - return value - - def add_avg_metric( - self, - group: str, - key: str, - value: bool | int | float | None, - start_val: bool | int | float | None = 0.0, - count: int | None = 1, - ): - """ - Add a value to a running average metric calculation. - - :param group: Metric group identifier - :param key: Metric key within the group - :param value: Value to add to the average - :param start_val: Optional starting value to subtract before adding - :param count: Number of observations this value represents - """ - if value is None or count is None: - return - - if start_val is not None: - value -= start_val - - total_key = f"{group}_{key}_total" - count_key = f"{group}_{key}_count" - self[total_key] = self.get(total_key, 0) + value - self[count_key] = self.get(count_key, 0) + count - - average = self[total_key] / self[count_key] if self[count_key] > 0 else 0.0 - self.set_metric( - group=group, - key=key, - value=average, - ) - - def add_avg_rate_metric( - self, - group: str, - key: str, - value: bool | int | float | None, - start_val: bool | int | float | None = 0.0, - start_time: float | None = None, - end_time: float | None = None, - numerator_type: Literal["avg", "total", "count"] = "total", - ): - """ - Add a value to a rate-based average metric calculation. - - :param group: Metric group identifier - :param key: Metric key within the group - :param value: Value to add to the average - :param start_val: Optional starting value to subtract before adding - :param start_time: Start time for rate calculation, defaults to current time - :param end_time: End time for rate calculation, defaults to current time - :param numerator_type: Type of numerator for rate calculation - """ - if value is None: - return - - self.add_avg_metric( - group=group, - key=key, - value=value, - start_val=start_val, - ) - start_time_key = f"{group}_{key}_start_time" - if self.get(start_time_key) is None: - if start_time is None: - start_time = time.time() - self[start_time_key] = start_time - else: - self[start_time_key] = start_time or self[start_time_key] - - end_time = end_time or time.time() - elapsed_time = end_time - self[start_time_key] - - if elapsed_time > 0: - numerator_key = ( - f"{group}_{key}_{numerator_type}" - if numerator_type != "avg" - else f"{group}_{key}" - ) - rate = self[numerator_key] / elapsed_time - self.set_metric( - group=group, - key=f"{key}_per_second", - value=rate, - ) - - def add_time_averaged_metric( - self, - group: str, - key: str, - value: bool | int | float | None, - recorded_time: float | None = None, - ): - """ - Add a value to a time-weighted average metric calculation. 
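For reviewers of this deletion: the time-weighted average the old accumulator tracked is the usual "value held over elapsed time" formula; a compact standalone sketch of the idea (the removed implementation differs in minor bookkeeping details):

# Sketch of a time-weighted average over (timestamp, value) samples: each value
# is weighted by how long it remained the most recent observation.
# Assumes at least one sample.
def time_weighted_mean(samples: list[tuple[float, float]]) -> float:
    numerator = 0.0
    denominator = 0.0
    for (t_prev, v_prev), (t_next, _) in zip(samples, samples[1:]):
        numerator += v_prev * (t_next - t_prev)
        denominator += t_next - t_prev
    return numerator / denominator if denominator else samples[-1][1]


print(time_weighted_mean([(0.0, 2.0), (1.0, 4.0), (3.0, 4.0)]))  # (2*1 + 4*2) / 3 -> 3.33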
- - :param group: Metric group identifier - :param key: Metric key within the group - :param value: Value to add to the time-weighted average - :param recorded_time: Time of the observation, defaults to current time - """ - if value is None: - return - - if recorded_time is None: - recorded_time = time.time() - - time_avg_numerator_key = f"{group}_{key}_time_avg_numerator" - time_avg_denominator_key = f"{group}_{key}_time_avg_denominator" - last_recorded_time_key = f"{group}_{key}_last_recorded_time" - last_recorded_value_key = f"{group}_{key}_last_recorded_value" - - if last_recorded_time_key not in self: - self[last_recorded_time_key] = recorded_time - self[last_recorded_value_key] = value - self[time_avg_numerator_key] = value - self[time_avg_denominator_key] = 0.0 - else: - time_delta = recorded_time - self[last_recorded_time_key] - self[time_avg_numerator_key] += self[last_recorded_value_key] * time_delta - self[time_avg_denominator_key] += time_delta - self[last_recorded_time_key] = recorded_time - self[last_recorded_value_key] = value - - if self[time_avg_denominator_key] > 0: - average = self[time_avg_numerator_key] / self[time_avg_denominator_key] - else: - average = value - - self.set_metric( - group=group, - key=key, - value=average, - ) - - -class BenchmarkerArgs(StandardBaseDict): - """ - Configuration parameters for benchmark execution and request sampling. - - Defines run identification, request sampling strategy, warmup/cooldown phases, - and metric preferences for benchmark executions. Provides methods to determine - whether a request falls within warmup or cooldown periods based on time, - request count, or percentage-based thresholds. - """ - - run_id: str = Field( - default_factory=lambda: str(uuid.uuid4()), - description="Unique identifier for the benchmark run", - ) - run_index: int = Field(default=0, description="Index of the benchmark run") - sample_requests: int | None = Field( - default=20, - description=( - "Number of requests to sample and keep in the final benchmark for metrics" - ), - ) - warmup: int | float | None = Field( - default=None, description="Warmup time before benchmarking starts" - ) - cooldown: int | float | None = Field( - default=None, description="Cooldown time after benchmarking ends" - ) - prefer_response_metrics: bool = Field( - default=True, - description="Whether to prefer response metrics over request metrics", - ) - - def is_in_warmup( - self, request_info: RequestInfo, scheduler_state: SchedulerState - ) -> bool: - """ - Check if a request is in the warmup phase. - - :param request_info: Information about the current request - :param scheduler_state: Current state of the scheduler - :return: True if the request is in warmup phase, False otherwise - """ - if self.warmup is not None and 0 < self.warmup < 1: - # Percentage-based warmup - return ( - scheduler_state.remaining_fraction is not None - and scheduler_state.remaining_fraction > (1 - self.warmup) - ) - - if self.warmup is not None and self.warmup > 1: - # Count/time-based warmup - if scheduler_state.processed_requests < self.warmup: - return True - - current_time = request_info.timings.targeted_start - return ( - current_time is not None - and (current_time - scheduler_state.start_time) < self.warmup - ) - - return False - - def is_in_cooldown( - self, request_info: RequestInfo, scheduler_state: SchedulerState - ) -> bool: - """ - Check if a request is in the cooldown phase. 
- - :param request_info: Information about the current request - :param scheduler_state: Current state of the scheduler - :return: True if the request is in cooldown phase, False otherwise - """ - if self.cooldown is not None and 0 < self.cooldown < 1: - # Percentage-based cooldown - return ( - scheduler_state.remaining_fraction is not None - and scheduler_state.remaining_fraction < self.cooldown - ) - - if self.cooldown is not None and self.cooldown > 1: - # Count/time-based cooldown - if ( - scheduler_state.remaining_requests is not None - and scheduler_state.remaining_requests <= self.cooldown - ): - return True - - current_time = ( - request_info.timings.resolve_end or request_info.timings.targeted_start - ) - return ( - current_time is not None - and scheduler_state.remaining_duration is not None - and scheduler_state.remaining_duration < self.cooldown - ) - - return False - - -class Benchmark(ABC): - """ - Abstract base interface for benchmark result implementations. - - Defines the contract for benchmark classes to provide run metrics sampling, - request metrics sampling, real-time estimate updates, and final compilation - of benchmark results from scheduler execution data. - """ - - @abstractmethod - def get_run_metrics_sample( - self, - ) -> dict[Literal["start_time", "end_time", "duration"], float]: - """ - Get a sample of run-level timing metrics. - - :return: Dictionary containing start_time, end_time, and duration metrics - """ - ... - - @abstractmethod - def get_request_metrics_sample( - self, - ) -> dict[ - Literal[ - "request_count", - "request_latency", - "request_throughput", - "request_concurrency", - ], - float, - ]: - """ - Get a sample of request-level performance metrics. - - :return: Dictionary containing request count, latency, throughput, and - concurrency metrics - """ - ... - - @classmethod - @abstractmethod - def update_estimate( - cls, - args: BenchmarkerArgs, - state: EstimatedBenchmarkState, - response: Any, - request: Any, - request_info: RequestInfo, - scheduler_state: SchedulerState, - ): - """ - Update real-time benchmark estimates with new request data. - - :param args: Benchmark configuration arguments - :param state: Current estimated benchmark state to update - :param response: Response received from the backend - :param request: Original request sent to the backend - :param request_info: Metadata about the request execution - :param scheduler_state: Current state of the scheduler - """ - ... - - @classmethod - @abstractmethod - def compile( - cls, - args: BenchmarkerArgs, - estimated_state: EstimatedBenchmarkState, - scheduler_state: SchedulerState, - profile: Profile, - requests: Iterable, - backend: BackendInterface, - environment: Environment, - strategy: SchedulingStrategy, - constraints: dict[str, dict[str, Any]], - ) -> Any: - """ - Compile final benchmark results from accumulated state. - - :param args: Benchmark configuration arguments - :param estimated_state: Accumulated benchmark state from execution - :param scheduler_state: Final state of the scheduler - :param profile: Benchmark profile configuration - :param requests: Collection of requests executed - :param backend: Backend interface used for execution - :param environment: Execution environment configuration - :param strategy: Scheduling strategy used - :param constraints: Execution constraints applied - :return: Compiled benchmark results instance - """ - ... 
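The warmup/cooldown checks being removed here treat values in (0, 1) as a fraction of the run and larger values as a request count or a duration in seconds; a condensed sketch of the warmup side of that rule (simplified from the deleted BenchmarkerArgs.is_in_warmup):

# Condensed sketch of the removed warmup rule: fractional values gate on the
# remaining fraction of the run, larger values gate on processed requests or
# elapsed seconds.
def in_warmup(
    warmup: float | None,
    remaining_fraction: float | None,
    processed_requests: int,
    elapsed_seconds: float,
) -> bool:
    if warmup is None:
        return False
    if 0 < warmup < 1:  # percentage of the total run
        return remaining_fraction is not None and remaining_fraction > (1 - warmup)
    return processed_requests < warmup or elapsed_seconds < warmup


print(in_warmup(0.1, remaining_fraction=0.95, processed_requests=3, elapsed_seconds=2.0))  # True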
-
-
-BenchmarkT = TypeVar("BenchmarkT", bound=Benchmark)
-
-
-class BenchmarkSchedulerStats(StandardBaseDict):
-    """Scheduler timing and performance statistics."""
-
-    group_name: ClassVar[Literal["scheduler_stats"]] = "scheduler_stats"
-
-    start_time: float = Field(
-        description="Unix timestamp when the benchmark run started"
-    )
-    end_time: float = Field(description="Unix timestamp when the benchmark run ended")
-    requests_made: StatusBreakdown[int, int, int, int] = Field(
-        description="Request counts by status: successful, incomplete, errored, total"
-    )
-    queued_time_avg: float = Field(
-        description="Avg time requests spent in the queue (seconds)"
-    )
-    worker_resolve_start_delay_avg: float = Field(
-        description="Avg delay before worker begins resolving req after dequeue (sec)"
-    )
-    worker_resolve_time_avg: float = Field(
-        description="Avg time for worker to resolve requests (seconds)"
-    )
-    worker_resolve_end_delay_avg: float = Field(
-        description="Avg delay after request end till worker resolves (seconds)"
-    )
-    finalized_delay_avg: float = Field(
-        description="Avg delay after resolve until finalized within scheduler (sec)"
-    )
-    worker_targeted_start_delay_avg: float = Field(
-        description="Avg delay from targeted start to actual worker start (seconds)"
-    )
-    request_start_delay_avg: float = Field(
-        description="Avg delay after resolve until request start (seconds)"
-    )
-    request_time_avg: float = Field(description="Avg request processing time (seconds)")
-    request_targeted_start_delay_avg: float = Field(
-        description="Avg delay from targeted start to actual request start (seconds)"
-    )
-
-    @classmethod
-    def update_estimate(cls, state: EstimatedBenchmarkState, request_info: RequestInfo):
-        """
-        Update estimated scheduler statistics with request timing information.
- - :param state: Current estimated benchmark state to update - :param request_info: Metadata about the request execution with timing data - """ - state.set_metric(group=cls.group_name, key="updated", value=True) - state.add_avg_metric( - group=cls.group_name, - key="queued_time", - value=request_info.timings.dequeued, - start_val=request_info.timings.queued, - ) - state.add_avg_metric( - group=cls.group_name, - key="worker_resolve_start_delay", - value=request_info.timings.resolve_start, - start_val=request_info.timings.scheduled_at, - ) - state.add_avg_metric( - group=cls.group_name, - key="worker_resolve_time", - value=request_info.timings.resolve_end, - start_val=request_info.timings.resolve_start, - ) - state.add_avg_metric( - group=cls.group_name, - key="worker_resolve_end_delay", - value=request_info.timings.request_end, - start_val=request_info.timings.resolve_end, - ) - state.add_avg_metric( - group=cls.group_name, - key="finalized_delay", - value=request_info.timings.finalized, - start_val=request_info.timings.resolve_end, - ) - state.add_avg_metric( - group=cls.group_name, - key="worker_targeted_start_delay", - value=request_info.timings.resolve_start, - start_val=request_info.timings.targeted_start, - ) - state.add_avg_metric( - group=cls.group_name, - key="request_start_delay", - value=request_info.timings.request_start, - start_val=request_info.timings.resolve_start, - ) - state.add_avg_metric( - group=cls.group_name, - key="request_time", - value=request_info.timings.request_end, - start_val=request_info.timings.request_start, - ) - state.add_avg_metric( - group=cls.group_name, - key="request_targeted_start_delay", - value=request_info.timings.request_start, - start_val=request_info.timings.targeted_start, - ) - - @classmethod - def compile( - cls, estimated_state: EstimatedBenchmarkState, scheduler_state: SchedulerState - ) -> BenchmarkSchedulerStats: - """ - Compile final scheduler statistics from accumulated state. 
- - :param estimated_state: Accumulated benchmark state with scheduler metrics - :param scheduler_state: Final state of the scheduler - :return: Compiled scheduler statistics instance - """ - return BenchmarkSchedulerStats( - start_time=scheduler_state.start_time, - end_time=scheduler_state.end_time or scheduler_state.start_time, - requests_made=StatusBreakdown[int, int, int, int]( - successful=scheduler_state.successful_requests, - incomplete=scheduler_state.cancelled_requests, - errored=scheduler_state.errored_requests, - total=( - scheduler_state.successful_requests - + scheduler_state.cancelled_requests - + scheduler_state.errored_requests - ), - ), - queued_time_avg=cast( - "float", - estimated_state.get_metric( - group=cls.group_name, key="queued_time", default=-1.0 - ), - ), - worker_resolve_start_delay_avg=cast( - "float", - estimated_state.get_metric( - group=cls.group_name, key="worker_resolve_start_delay", default=-1.0 - ), - ), - worker_resolve_time_avg=cast( - "float", - estimated_state.get_metric( - group=cls.group_name, key="worker_resolve_time", default=-1.0 - ), - ), - worker_resolve_end_delay_avg=cast( - "float", - estimated_state.get_metric( - group=cls.group_name, key="worker_resolve_end_delay", default=-1.0 - ), - ), - finalized_delay_avg=cast( - "float", - estimated_state.get_metric( - group=cls.group_name, key="finalized_delay", default=-1.0 - ), - ), - worker_targeted_start_delay_avg=cast( - "float", - estimated_state.get_metric( - group=cls.group_name, - key="worker_targeted_start_delay", - default=-1.0, - ), - ), - request_start_delay_avg=cast( - "float", - estimated_state.get_metric( - group=cls.group_name, key="request_start_delay", default=-1.0 - ), - ), - request_time_avg=cast( - "float", - estimated_state.get_metric( - group=cls.group_name, key="request_time", default=-1.0 - ), - ), - request_targeted_start_delay_avg=cast( - "float", - estimated_state.get_metric( - group=cls.group_name, - key="request_targeted_start_delay", - default=-1.0, - ), - ), - ) - - -class GenerativeMetricsSummary(StandardBaseDict): - """ - Statistical summaries for input, output, and total metrics. - - Provides distribution summaries across successful, incomplete, and errored - requests for absolute values, per-second rates, and concurrency levels. 
- """ - - input: StatusDistributionSummary = Field( - description="Distribution of input metric values" - ) - input_per_second: StatusDistributionSummary = Field( - description="Distribution of input metric rates per second" - ) - input_concurrency: StatusDistributionSummary = Field( - description="Distribution of concurrent input metric values" - ) - - output: StatusDistributionSummary = Field( - description="Distribution of output metric values" - ) - output_per_second: StatusDistributionSummary = Field( - description="Distribution of output metric rates per second" - ) - output_concurrency: StatusDistributionSummary = Field( - description="Distribution of concurrent output metric values" - ) - - total: StatusDistributionSummary = Field( - description="Distribution of total metric values (input + output)" - ) - total_per_second: StatusDistributionSummary = Field( - description="Distribution of total metric rates per second" - ) - total_concurrency: StatusDistributionSummary = Field( - description="Distribution of concurrent total metric values" - ) - - @classmethod - def compile( - cls, - request_types: list[Literal["successful", "incomplete", "error"]], - request_times: list[tuple[float, float]], - input_values: list[int | float], - output_values: list[int | float], - ) -> GenerativeMetricsSummary: - """ - Compile generative metrics summary from request data. - - :param request_types: Status types for each request - :param request_times: Start and end times for each request - :param input_values: Input metric values for each request - :param output_values: Output metric values for each request - :return: Compiled generative metrics summary - """ - total_values = [ - input_val + output_val - for input_val, output_val in zip(input_values, output_values, strict=False) - ] - - return GenerativeMetricsSummary( - input=StatusDistributionSummary.from_values( - value_types=request_types, - values=input_values, - ), - input_per_second=StatusDistributionSummary.from_request_times( - request_types=request_types, - requests=request_times, - distribution_type="rate", - weights=input_values, - ), - input_concurrency=StatusDistributionSummary.from_request_times( - request_types=request_types, - requests=request_times, - distribution_type="concurrency", - weights=input_values, - ), - output=StatusDistributionSummary.from_values( - value_types=request_types, - values=output_values, - ), - output_per_second=StatusDistributionSummary.from_request_times( - request_types=request_types, - requests=request_times, - distribution_type="rate", - weights=output_values, - ), - output_concurrency=StatusDistributionSummary.from_request_times( - request_types=request_types, - requests=request_times, - distribution_type="concurrency", - weights=output_values, - ), - total=StatusDistributionSummary.from_values( - value_types=request_types, - values=total_values, - ), - total_per_second=StatusDistributionSummary.from_request_times( - request_types=request_types, - requests=request_times, - distribution_type="rate", - weights=total_values, - ), - total_concurrency=StatusDistributionSummary.from_request_times( - request_types=request_types, - requests=request_times, - distribution_type="concurrency", - weights=total_values, - ), - ) - - -class GenerativeTextMetricsSummary(StandardBaseDict): - """ - Text-specific metric summaries for generative benchmarks. - - Tracks token, word, and character-level metrics across input, output, and - total usage for text generation workloads. 
- """ - - tokens: GenerativeMetricsSummary = Field( - description="Token count metrics and distributions" - ) - words: GenerativeMetricsSummary = Field( - description="Word count metrics and distributions" - ) - characters: GenerativeMetricsSummary = Field( - description="Character count metrics and distributions" - ) - - @classmethod - def compile( - cls, - request_types: list[Literal["successful", "incomplete", "error"]], - request_times: list[tuple[float, float]], - input_metrics: list[UsageMetrics], - output_metrics: list[UsageMetrics], - ) -> GenerativeTextMetricsSummary: - """ - Compile text metrics summary from request usage data. - - :param request_types: Status types for each request - :param request_times: Start and end times for each request - :param input_metrics: Input usage metrics for each request - :param output_metrics: Output usage metrics for each request - :return: Compiled text metrics summary - """ - return GenerativeTextMetricsSummary( - tokens=GenerativeMetricsSummary.compile( - request_types=request_types, - request_times=request_times, - input_values=[metrics.text_tokens or 0 for metrics in input_metrics], - output_values=[metrics.text_tokens or 0 for metrics in output_metrics], - ), - words=GenerativeMetricsSummary.compile( - request_types=request_types, - request_times=request_times, - input_values=[metrics.text_words or 0 for metrics in input_metrics], - output_values=[metrics.text_words or 0 for metrics in output_metrics], - ), - characters=GenerativeMetricsSummary.compile( - request_types=request_types, - request_times=request_times, - input_values=[ - metrics.text_characters or 0 for metrics in input_metrics - ], - output_values=[ - metrics.text_characters or 0 for metrics in output_metrics - ], - ), - ) - - -class GenerativeImageMetricsSummary(StandardBaseDict): - """ - Image-specific metric summaries for generative benchmarks. - - Tracks token, image count, pixel, and byte-level metrics across input, output, - and total usage for image generation workloads. - """ - - tokens: GenerativeMetricsSummary = Field( - description="Image token count metrics and distributions" - ) - images: GenerativeMetricsSummary = Field( - description="Image count metrics and distributions" - ) - pixels: GenerativeMetricsSummary = Field( - description="Pixel count metrics and distributions" - ) - bytes: GenerativeMetricsSummary = Field( - description="Byte size metrics and distributions" - ) - - @classmethod - def compile( - cls, - request_types: list[Literal["successful", "incomplete", "error"]], - request_times: list[tuple[float, float]], - input_metrics: list[UsageMetrics], - output_metrics: list[UsageMetrics], - ) -> GenerativeImageMetricsSummary: - """ - Compile image metrics summary from request usage data. 
- - :param request_types: Status types for each request - :param request_times: Start and end times for each request - :param input_metrics: Input usage metrics for each request - :param output_metrics: Output usage metrics for each request - :return: Compiled image metrics summary - """ - return GenerativeImageMetricsSummary( - tokens=GenerativeMetricsSummary.compile( - request_types=request_types, - request_times=request_times, - input_values=[metrics.image_tokens or 0 for metrics in input_metrics], - output_values=[metrics.image_tokens or 0 for metrics in output_metrics], - ), - images=GenerativeMetricsSummary.compile( - request_types=request_types, - request_times=request_times, - input_values=[metrics.image_count or 0 for metrics in input_metrics], - output_values=[metrics.image_count or 0 for metrics in output_metrics], - ), - pixels=GenerativeMetricsSummary.compile( - request_types=request_types, - request_times=request_times, - input_values=[metrics.image_pixels or 0 for metrics in input_metrics], - output_values=[metrics.image_pixels or 0 for metrics in output_metrics], - ), - bytes=GenerativeMetricsSummary.compile( - request_types=request_types, - request_times=request_times, - input_values=[metrics.image_bytes or 0 for metrics in input_metrics], - output_values=[metrics.image_bytes or 0 for metrics in output_metrics], - ), - ) - - -class GenerativeVideoMetricsSummary(StandardBaseDict): - """ - Video-specific metric summaries for generative benchmarks. - - Tracks token, frame count, duration, and byte-level metrics across input, - output, and total usage for video generation workloads. - """ - - tokens: GenerativeMetricsSummary = Field( - description="Video token count metrics and distributions" - ) - frames: GenerativeMetricsSummary = Field( - description="Frame count metrics and distributions" - ) - seconds: GenerativeMetricsSummary = Field( - description="Duration metrics in seconds and distributions" - ) - bytes: GenerativeMetricsSummary = Field( - description="Byte size metrics and distributions" - ) - - @classmethod - def compile( - cls, - request_types: list[Literal["successful", "incomplete", "error"]], - request_times: list[tuple[float, float]], - input_metrics: list[UsageMetrics], - output_metrics: list[UsageMetrics], - ) -> GenerativeVideoMetricsSummary: - """ - Compile video metrics summary from request usage data. 
- - :param request_types: Status types for each request - :param request_times: Start and end times for each request - :param input_metrics: Input usage metrics for each request - :param output_metrics: Output usage metrics for each request - :return: Compiled video metrics summary - """ - return GenerativeVideoMetricsSummary( - tokens=GenerativeMetricsSummary.compile( - request_types=request_types, - request_times=request_times, - input_values=[metrics.video_tokens or 0 for metrics in input_metrics], - output_values=[metrics.video_tokens or 0 for metrics in output_metrics], - ), - frames=GenerativeMetricsSummary.compile( - request_types=request_types, - request_times=request_times, - input_values=[metrics.video_frames or 0 for metrics in input_metrics], - output_values=[metrics.video_frames or 0 for metrics in output_metrics], - ), - seconds=GenerativeMetricsSummary.compile( - request_types=request_types, - request_times=request_times, - input_values=[metrics.video_seconds or 0 for metrics in input_metrics], - output_values=[ - metrics.video_seconds or 0 for metrics in output_metrics - ], - ), - bytes=GenerativeMetricsSummary.compile( - request_types=request_types, - request_times=request_times, - input_values=[metrics.video_bytes or 0 for metrics in input_metrics], - output_values=[metrics.video_bytes or 0 for metrics in output_metrics], - ), - ) - - -class GenerativeAudioMetricsSummary(StandardBaseDict): - """ - Audio-specific metric summaries for generative benchmarks. - - Tracks token, sample count, duration, and byte-level metrics across input, - output, and total usage for audio generation workloads. - """ - - tokens: GenerativeMetricsSummary = Field( - description="Audio token count metrics and distributions" - ) - samples: GenerativeMetricsSummary = Field( - description="Sample count metrics and distributions" - ) - seconds: GenerativeMetricsSummary = Field( - description="Duration metrics in seconds and distributions" - ) - bytes: GenerativeMetricsSummary = Field( - description="Byte size metrics and distributions" - ) - - @classmethod - def compile( - cls, - request_types: list[Literal["successful", "incomplete", "error"]], - request_times: list[tuple[float, float]], - input_metrics: list[UsageMetrics], - output_metrics: list[UsageMetrics], - ) -> GenerativeAudioMetricsSummary: - """ - Compile audio metrics summary from request usage data. 
- - :param request_types: Status types for each request - :param request_times: Start and end times for each request - :param input_metrics: Input usage metrics for each request - :param output_metrics: Output usage metrics for each request - :return: Compiled audio metrics summary - """ - return GenerativeAudioMetricsSummary( - tokens=GenerativeMetricsSummary.compile( - request_types=request_types, - request_times=request_times, - input_values=[metrics.audio_tokens or 0 for metrics in input_metrics], - output_values=[metrics.audio_tokens or 0 for metrics in output_metrics], - ), - samples=GenerativeMetricsSummary.compile( - request_types=request_types, - request_times=request_times, - input_values=[metrics.audio_samples or 0 for metrics in input_metrics], - output_values=[ - metrics.audio_samples or 0 for metrics in output_metrics - ], - ), - seconds=GenerativeMetricsSummary.compile( - request_types=request_types, - request_times=request_times, - input_values=[metrics.audio_seconds or 0 for metrics in input_metrics], - output_values=[ - metrics.audio_seconds or 0 for metrics in output_metrics - ], - ), - bytes=GenerativeMetricsSummary.compile( - request_types=request_types, - request_times=request_times, - input_values=[metrics.audio_bytes or 0 for metrics in input_metrics], - output_values=[metrics.audio_bytes or 0 for metrics in output_metrics], - ), - ) - - -class GenerativeMetrics(StandardBaseDict): - """Comprehensive metrics for generative AI benchmarks.""" - - # Request stats - requests_per_second: StatusDistributionSummary = Field( - description="Distribution of requests per second across benchmark execution" - ) - request_concurrency: StatusDistributionSummary = Field( - description="Distribution of concurrent request counts during execution" - ) - request_latency: StatusDistributionSummary = Field( - description="Distribution of request latencies for completed requests" - ) - request_streaming_iterations_count: StatusDistributionSummary = Field( - description="Distribution of stream iterations for completed requests" - ) - - # General token stats - prompt_token_count: StatusDistributionSummary = Field( - description="Distribution of prompt token counts by request status" - ) - output_token_count: StatusDistributionSummary = Field( - description="Distribution of output token counts by request status" - ) - total_token_count: StatusDistributionSummary = Field( - description="Distribution of total token counts by request status" - ) - time_to_first_token_ms: StatusDistributionSummary = Field( - description="Distribution of first token latencies in milliseconds" - ) - time_per_output_token_ms: StatusDistributionSummary = Field( - description="Distribution of average time per output token in milliseconds" - ) - inter_token_latency_ms: StatusDistributionSummary = Field( - description="Distribution of inter-token latencies in milliseconds" - ) - output_tokens_wo_first_per_iteration: StatusDistributionSummary = Field( - description=( - "Distribution of output tokens (without first) generated per " - "streaming iteration" - ) - ) - output_tokens_per_second: StatusDistributionSummary = Field( - description="Distribution of output token generation rates" - ) - output_tokens_per_iteration: StatusDistributionSummary = Field( - description="Distribution of output tokens generated per streaming iteration" - ) - tokens_per_second: StatusDistributionSummary = Field( - description="Distribution of total token throughput including prompt and output" - ) - - # Domain specific stats - text: 
GenerativeTextMetricsSummary = Field( - description="Text-specific metrics for tokens, words, and characters" - ) - image: GenerativeImageMetricsSummary = Field( - description="Image-specific metrics for tokens, images, pixels, and bytes" - ) - video: GenerativeVideoMetricsSummary = Field( - description="Video-specific metrics for tokens, frames, duration, and bytes" - ) - audio: GenerativeAudioMetricsSummary = Field( - description="Audio-specific metrics for tokens, samples, duration, and bytes" - ) - - @classmethod - def update_estimate( - cls, - state: EstimatedBenchmarkState, - response: GenerationResponse | None, - request: GenerationRequest, - request_info: RequestInfo, - scheduler_state: SchedulerState, - ): - """ - Update real-time generative metrics estimates with new request data. - - :param state: Current estimated benchmark state to update - :param response: Response received from the backend - :param request: Original request sent to the backend - :param request_info: Metadata about the request execution - :param scheduler_state: Current state of the scheduler - """ - benchmark_start_time = scheduler_state.start_time - request_start_time = ( - request_info.timings.request_start or request_info.timings.resolve_start - ) - request_end_time = ( - request_info.timings.request_end or request_info.timings.resolve_end - ) - event_occurence_time = ( - request_info.timings.queued - if request_info.status == "queued" - else ( - request_info.timings.dequeued - if request_info.status == "pending" - else request_start_time - if request_info.status == "in_progress" - else request_end_time - ) - ) - benchmark_duration = ( - event_occurence_time - benchmark_start_time - if event_occurence_time - else None - ) - request_duration = ( - (request_end_time - request_start_time) - if request_end_time and request_start_time - else None - ) - - # Always track concurrency - if event_occurence_time is not None: - state.add_time_averaged_metric( - group=EstimatedBenchmarkState.benchmark_metrics_group, - key="concurrency_requests", - value=scheduler_state.processing_requests, - recorded_time=event_occurence_time, - ) - - if request_info.status not in {"completed", "errored", "cancelled"}: - return - - state.set_metric( - group=EstimatedBenchmarkState.benchmark_metrics_group, - key="updated", - value=True, - ) - - for prefix in (request_info.status, "total"): - requests_count = ( - scheduler_state.successful_requests - if prefix == "completed" - else scheduler_state.errored_requests - if prefix == "errored" - else scheduler_state.cancelled_requests - if prefix == "cancelled" - else scheduler_state.processed_requests - ) - input_tokens = ( - (response.input_metrics.total_tokens if response else None) - or request.input_metrics.total_tokens - or 0 - ) - output_tokens = ( - (response.output_metrics.total_tokens if response else None) - or request.output_metrics.total_tokens - or 0 - ) - - # Request distribution stats - state.set_metric( - group=EstimatedBenchmarkState.benchmark_metrics_group, - key=f"{prefix}_requests", - value=requests_count, - ) - state.set_metric( - group=EstimatedBenchmarkState.benchmark_metrics_group, - key=f"{prefix}_requests_per_second", - value=( - requests_count / benchmark_duration if benchmark_duration else None - ), - ) - state.add_avg_metric( - group=EstimatedBenchmarkState.benchmark_metrics_group, - key=f"{prefix}_request_latency", - value=request_duration, - ) - state.add_avg_metric( - group=EstimatedBenchmarkState.benchmark_metrics_group, - 
key=f"{prefix}_request_streaming_iterations", - value=request_info.timings.iterations or 0, - ) - - # Token iteration stats - state.add_avg_metric( - group=EstimatedBenchmarkState.benchmark_metrics_group, - key="output_tokens_iterations", - value=output_tokens, - count=request_info.timings.iterations or 1, - ) - state.add_avg_metric( - group=EstimatedBenchmarkState.benchmark_metrics_group, - key="output_tokens_wo_first_iterations", - value=output_tokens - 1 if output_tokens > 1 else 0, - count=request_info.timings.iterations or 1, - ) - - # Token metrics stats - state.add_avg_metric( - group=EstimatedBenchmarkState.benchmark_metrics_group, - key=f"{prefix}_time_to_first_token", - value=request_info.timings.first_iteration, - start_val=request_start_time, - ) - state.add_avg_metric( - group=EstimatedBenchmarkState.benchmark_metrics_group, - key=f"{prefix}_inter_token_latency", - value=request_info.timings.last_iteration, - start_val=request_info.timings.first_iteration, - count=(output_tokens or 1) - 1, - ) - state.add_avg_metric( - group=EstimatedBenchmarkState.benchmark_metrics_group, - key=f"{prefix}_time_per_output_token", - value=request_duration, - count=output_tokens or 0, - ) - - # Input/output throughput stats - if event_occurence_time is not None: - state.add_avg_rate_metric( - group=EstimatedBenchmarkState.benchmark_metrics_group, - key="input_tokens", - value=input_tokens, - start_time=benchmark_start_time, - end_time=event_occurence_time, - ) - state.add_avg_rate_metric( - group=EstimatedBenchmarkState.benchmark_metrics_group, - key="output_tokens", - value=output_tokens, - start_time=benchmark_start_time, - end_time=event_occurence_time, - ) - state.add_avg_rate_metric( - group=EstimatedBenchmarkState.benchmark_metrics_group, - key="total_tokens", - value=input_tokens + output_tokens, - start_time=benchmark_start_time, - end_time=event_occurence_time, - ) - state.add_avg_rate_metric( - group=EstimatedBenchmarkState.benchmark_metrics_group, - key="input_text_tokens", - value=( - (response.input_metrics.text_tokens if response else None) - or request.input_metrics.text_tokens - or 0 - ), - start_time=benchmark_start_time, - end_time=event_occurence_time, - ) - state.add_avg_rate_metric( - group=EstimatedBenchmarkState.benchmark_metrics_group, - key="input_images", - value=( - (response.input_metrics.image_count if response else None) - or request.input_metrics.image_count - or 0 - ), - start_time=benchmark_start_time, - end_time=event_occurence_time, - ) - state.add_avg_rate_metric( - group=EstimatedBenchmarkState.benchmark_metrics_group, - key="input_video_frames", - value=( - (response.input_metrics.video_frames if response else None) - or request.input_metrics.video_frames - or 0 - ), - start_time=benchmark_start_time, - end_time=event_occurence_time, - ) - state.add_avg_rate_metric( - group=EstimatedBenchmarkState.benchmark_metrics_group, - key="input_audio_seconds", - value=request.input_metrics.audio_seconds or 0, - start_time=benchmark_start_time, - end_time=event_occurence_time, - ) - - @classmethod - def compile( - cls, - completed: list[GenerativeRequestStats], - errored: list[GenerativeRequestStats], - incomplete: list[GenerativeRequestStats], - ) -> GenerativeMetrics: - """ - Compile final generative metrics from request statistics. 
- - :param completed: Successfully completed request statistics - :param errored: Failed request statistics - :param incomplete: Incomplete/cancelled request statistics - :return: Compiled generative metrics with full distributions - """ - requests = completed + errored + incomplete - request_types = cast( - "list[Literal['successful', 'error', 'incomplete']]", - ["successful"] * len(completed) - + ["error"] * len(errored) - + ["incomplete"] * len(incomplete), - ) - request_times = [ - ( - req.info.timings.request_start or req.info.timings.resolve_start or 0, - req.info.timings.request_end or req.info.timings.resolve_end or 0, - ) - for req in requests - ] - input_metrics = [req.input_metrics for req in requests] - output_metrics = [req.output_metrics for req in requests] - - return GenerativeMetrics( - # Request stats - requests_per_second=StatusDistributionSummary.from_request_times( - request_types=request_types, - requests=request_times, - distribution_type="rate", - ), - request_concurrency=StatusDistributionSummary.from_request_times( - request_types=request_types, - requests=request_times, - distribution_type="concurrency", - ), - request_latency=StatusDistributionSummary.from_values( - value_types=request_types, - values=[req.request_latency or 0.0 for req in requests], - ), - request_streaming_iterations_count=StatusDistributionSummary.from_values( - value_types=request_types, - values=[float(req.info.timings.iterations or 0) for req in requests], - ), - # General token stats - prompt_token_count=StatusDistributionSummary.from_values( - value_types=request_types, - values=[float(req.prompt_tokens or 0) for req in requests], - ), - output_token_count=StatusDistributionSummary.from_values( - value_types=request_types, - values=[float(req.output_tokens or 0) for req in requests], - ), - total_token_count=StatusDistributionSummary.from_values( - value_types=request_types, - values=[float(req.total_tokens or 0) for req in requests], - ), - time_to_first_token_ms=StatusDistributionSummary.from_values( - value_types=request_types, - values=[req.time_to_first_token_ms or 0.0 for req in requests], - ), - time_per_output_token_ms=StatusDistributionSummary.from_values( - value_types=request_types, - values=[req.time_per_output_token_ms or 0.0 for req in requests], - ), - inter_token_latency_ms=StatusDistributionSummary.from_values( - value_types=request_types, - values=[req.inter_token_latency_ms or 0.0 for req in requests], - ), - output_tokens_wo_first_per_iteration=StatusDistributionSummary.from_values( - value_types=request_types, - values=[ - max(0.0, (req.output_metrics.total_tokens or 1.0) - 1.0) - for req in requests - ], - weights=[req.info.timings.iterations or 1 for req in requests], - ), - output_tokens_per_second=StatusDistributionSummary.from_values( - value_types=request_types, - values=[req.output_tokens_per_second or 0.0 for req in requests], - ), - output_tokens_per_iteration=StatusDistributionSummary.from_values( - value_types=request_types, - values=[req.output_tokens_per_iteration or 0.0 for req in requests], - weights=[req.info.timings.iterations or 1 for req in requests], - ), - tokens_per_second=StatusDistributionSummary.from_values( - value_types=request_types, - values=[req.tokens_per_second or 0.0 for req in requests], - ), - # Domain-specific stats - text=GenerativeTextMetricsSummary.compile( - request_types=request_types, - request_times=request_times, - input_metrics=input_metrics, - output_metrics=output_metrics, - ), - 
image=GenerativeImageMetricsSummary.compile( - request_types=request_types, - request_times=request_times, - input_metrics=input_metrics, - output_metrics=output_metrics, - ), - video=GenerativeVideoMetricsSummary.compile( - request_types=request_types, - request_times=request_times, - input_metrics=input_metrics, - output_metrics=output_metrics, - ), - audio=GenerativeAudioMetricsSummary.compile( - request_types=request_types, - request_times=request_times, - input_metrics=input_metrics, - output_metrics=output_metrics, - ), - ) - - -class SchedulerDict(StandardBaseDict): - """Scheduler configuration and execution state dictionary.""" - - strategy: SchedulingStrategy = Field( - description="Scheduling strategy used for request distribution" - ) - constraints: dict[str, dict[str, Any]] = Field( - description="Execution constraints applied during benchmarking" - ) - state: SchedulerState = Field( - description="Final state of the scheduler after execution" - ) - - -class BenchmarkerDict(StandardBaseDict): - """Benchmarker configuration and component settings dictionary.""" - - profile: Profile = Field(description="Benchmark profile configuration") - requests: dict[str, Any] = Field( - description="Request configuration and dataset information" - ) - backend: dict[str, Any] = Field( - description="Backend configuration and connection details" - ) - environment: dict[str, Any] = Field( - description="Execution environment configuration" - ) - - -class GenerativeBenchmark(Benchmark, StandardBaseDict): - """Complete generative AI benchmark results with specialized metrics.""" - - group_name: ClassVar[Literal["generative_benchmark"]] = "generative_benchmark" - - type_: Literal["generative_benchmark"] = "generative_benchmark" # type: ignore[assignment] - id_: str = Field( - default_factory=lambda: str(uuid.uuid4()), - description="Unique identifier for this benchmark execution", - ) - run_id: str = Field( - description="Identifier for the benchmarker run containing this benchmark" - ) - run_index: int = Field( - description="Sequential index of this benchmark within the benchmarker run" - ) - scheduler: SchedulerDict = Field( - description="Scheduler configuration and execution state" - ) - benchmarker: BenchmarkerDict = Field( - description="Benchmarker configuration and component settings" - ) - run_stats: BenchmarkSchedulerStats = Field( - description="Scheduler timing and performance statistics" - ) - start_time: float = Field( - default=-1.0, description="Unix timestamp when the first request was initiated" - ) - end_time: float = Field( - default=-1.0, description="Unix timestamp when the last request completed" - ) - - def get_run_metrics_sample( - self, - ) -> dict[Literal["start_time", "end_time", "duration"], float]: - return { - "start_time": self.start_time, - "end_time": self.end_time, - "duration": self.duration, - } - - def get_request_metrics_sample( - self, - ) -> dict[ - Literal[ - "request_count", - "request_latency", - "request_throughput", - "request_concurrency", - ], - float, - ]: - return { - "request_count": self.request_totals.successful, - "request_latency": self.metrics.request_latency.successful.mean, - "request_throughput": self.metrics.requests_per_second.successful.mean, - "request_concurrency": self.metrics.request_concurrency.successful.mean, - } - - @computed_field # type: ignore[misc] - @property - def duration(self) -> float: - """ - Benchmark execution duration in seconds. - - :return: Time elapsed from first request start to last request completion. 
- """ - return self.end_time - self.start_time - - metrics: GenerativeMetrics = Field( - description="Performance metrics and statistical distributions" - ) - request_totals: StatusBreakdown[int, int, int, int] = Field( - description="Request counts by status: successful, incomplete, errored, total" - ) - requests: StatusBreakdown[ - list[GenerativeRequestStats], - list[GenerativeRequestStats], - list[GenerativeRequestStats], - None, - ] = Field( - description="Request details grouped by status: successful, incomplete, errored" - ) - - @classmethod - def update_estimate( - cls, - args: BenchmarkerArgs, - state: EstimatedBenchmarkState, - response: GenerationResponse | None, - request: GenerationRequest, - request_info: RequestInfo, - scheduler_state: SchedulerState, - ): - """ - Update generative benchmark estimates with new request data. - - Handles warmup/cooldown filtering, request sampling via reservoir sampling, - and delegates metric updates to child metric classes. - - :param args: Benchmark configuration arguments - :param state: Current estimated benchmark state to update - :param response: Response received from the backend - :param request: Original request sent to the backend - :param request_info: Metadata about the request execution - :param scheduler_state: Current state of the scheduler - """ - if ( - request_info.status == "cancelled" - and request_info.timings.resolve_start is None - ): - # Cancelled requests that never started should be ignored - return - - # Update child metric groups - BenchmarkSchedulerStats.update_estimate(state, request_info) - GenerativeMetrics.update_estimate( - state, response, request, request_info, scheduler_state - ) - - # Store requests and sampling info, update counts - if "requests_completed" not in state: - state["requests_completed"] = [] - state["samples_completed"] = [] - state["requests_errored"] = [] - state["samples_errored"] = [] - state["requests_incomplete"] = [] - state["samples_incomplete"] = [] - in_warmup = state.set_metric( - group=EstimatedBenchmarkState.benchmark_state_group, - key="in_warmup", - value=args.is_in_warmup(request_info, scheduler_state), - ) - in_cooldown = state.set_metric( - group=EstimatedBenchmarkState.benchmark_state_group, - key="in_cooldown", - value=args.is_in_cooldown(request_info, scheduler_state), - ) - state[f"{EstimatedBenchmarkState.benchmark_state_group}_status"] = ( - "in_cooldown" - if in_cooldown - else "in_warmup" - if in_warmup - else "in_progress" - ) - - if ( - request_info.status not in {"completed", "errored", "cancelled"} - or in_warmup - or in_cooldown - ): - # Must be fully resolved to be added - return - - state.set_metric( - group=EstimatedBenchmarkState.benchmark_state_group, - key="updated", - value=True, - ) - - if response is None: - response = GenerationResponse( - request_id=request.request_id, request_args=str(request.arguments) - ) - - stats = response.compile_stats( - request, request_info, args.prefer_response_metrics - ) - - # Determine status and get corresponding lists - if request_info.status == "completed": - requests_list = state["requests_completed"] - samples_list = state["samples_completed"] - elif request_info.status == "errored": - requests_list = state["requests_errored"] - samples_list = state["samples_errored"] - else: # cancelled (incomplete) - requests_list = state["requests_incomplete"] - samples_list = state["samples_incomplete"] - - # Add to requests list - requests_list.append(stats) - current_index = len(requests_list) - 1 - - # Handle request 
sampling logic - if args.sample_requests is None: - # No sampling, add index to samples list - samples_list.append(current_index) - elif args.sample_requests > 0 and len(samples_list) < args.sample_requests: - # Space in samples list, add index - samples_list.append(current_index) - elif ( - args.sample_requests > 0 - and (replace_index := random.randrange(len(requests_list))) - < args.sample_requests - ): - # No space, adding based on reservoir sampling - samples_list[replace_index] = current_index - # Sampling set to 0, don't keep any requests - - @classmethod - def compile( - cls, - args: BenchmarkerArgs, - estimated_state: EstimatedBenchmarkState, - scheduler_state: SchedulerState, - profile: Profile, - requests: Iterable, # noqa: ARG003 - backend: BackendInterface, - environment: Environment, - strategy: SchedulingStrategy, - constraints: dict[str, dict[str, Any]], - data: list[Any], - ) -> GenerativeBenchmark: - """ - Compile final generative benchmark from accumulated state. - - :param args: Benchmark configuration arguments - :param estimated_state: Accumulated benchmark state from execution - :param scheduler_state: Final state of the scheduler - :param profile: Benchmark profile configuration - :param requests: Collection of requests executed - :param backend: Backend interface used for execution - :param environment: Execution environment configuration - :param strategy: Scheduling strategy used - :param constraints: Execution constraints applied - :return: Compiled generative benchmark instance - """ - return GenerativeBenchmark( - run_id=args.run_id, - run_index=args.run_index, - scheduler=SchedulerDict( - strategy=strategy, - constraints={ - key: InfoMixin.extract_from_obj(val) - for key, val in constraints.items() - }, - state=scheduler_state, - ), - benchmarker=BenchmarkerDict( - profile=profile, - requests={"data": data}, - backend=backend.info, - environment=environment.info, - ), - run_stats=BenchmarkSchedulerStats.compile(estimated_state, scheduler_state), - start_time=scheduler_state.start_time or -1.0, - end_time=scheduler_state.end_time or -1.0, - metrics=GenerativeMetrics.compile( - completed=estimated_state.get("requests_completed", []), - errored=estimated_state.get("requests_errored", []), - incomplete=estimated_state.get("requests_incomplete", []), - ), - request_totals=StatusBreakdown[int, int, int, int]( - successful=len(estimated_state.get("requests_completed", [])), - incomplete=len(estimated_state.get("requests_incomplete", [])), - errored=len(estimated_state.get("requests_errored", [])), - total=( - len(estimated_state.get("requests_completed", [])) - + len(estimated_state.get("requests_incomplete", [])) - + len(estimated_state.get("requests_errored", [])) - ), - ), - requests=StatusBreakdown[ - list[GenerativeRequestStats], - list[GenerativeRequestStats], - list[GenerativeRequestStats], - None, - ]( - successful=estimated_state.get("requests_completed", []), - incomplete=estimated_state.get("requests_incomplete", []), - errored=estimated_state.get("requests_errored", []), - total=None, - ), - ) - - -class BenchmarkGenerativeTextArgs(StandardBaseModel): - """ - Configuration arguments for generative text benchmark execution. - - Defines all parameters for benchmark setup including target endpoint, data - sources, backend configuration, processing pipeline, output formatting, and - execution constraints. Supports loading from scenario files and merging with - runtime overrides. 
- """ - - @classmethod - def create( - cls, scenario: Path | str | None, **kwargs: dict[str, Any] - ) -> BenchmarkGenerativeTextArgs: - """ - Create benchmark args from scenario file and/or keyword arguments. - - :param scenario: Path to scenario file or name of built-in scenario - :param kwargs: Additional keyword arguments to override scenario values - :return: Configured benchmark args instance - :raises ValueError: If scenario is not found or file format is unsupported - """ - constructor_kwargs = {} - - if scenario is not None: - if isinstance(scenario, str) and scenario in ( - builtin_scenarios := get_builtin_scenarios() - ): - scenario_path = builtin_scenarios[scenario] - elif Path(scenario).exists() and Path(scenario).is_file(): - scenario_path = Path(scenario) - else: - raise ValueError(f"Scenario '{scenario}' not found.") - - with scenario_path.open() as file: - if scenario_path.suffix == ".json": - scenario_data = json.load(file) - elif scenario_path.suffix in {".yaml", ".yml"}: - scenario_data = yaml.safe_load(file) - else: - raise ValueError( - f"Unsupported scenario file format: {scenario_path.suffix}" - ) - if "args" in scenario_data: - # loading from a report file - scenario_data = scenario_data["args"] - constructor_kwargs.update(scenario_data) - - # Apply overrides from kwargs - constructor_kwargs.update(kwargs) - - return cls.model_validate(constructor_kwargs) - - @classmethod - def get_default(cls: type[BenchmarkGenerativeTextArgs], field: str) -> Any: - """ - Get default value for a model field. - - :param field: Name of the field to retrieve default for - :return: Default value for the specified field - :raises ValueError: If field is not found in model - """ - if field not in BenchmarkGenerativeTextArgs.model_fields: - raise ValueError( - f"Field '{field}' not found in BenchmarkGenerativeTextArgs" - ) - - field_info = BenchmarkGenerativeTextArgs.model_fields[field] - factory = field_info.default_factory - - if factory is None: - return field_info.default - - if len(inspect.signature(factory).parameters) == 0: - return factory() # type: ignore[call-arg] # Confirmed correct at runtime by code above - else: - return factory({}) # type: ignore[call-arg] # Confirmed correct at runtime by code above - - model_config = ConfigDict( - extra="ignore", - use_enum_values=True, - from_attributes=True, - arbitrary_types_allowed=True, - validate_by_alias=True, - validate_by_name=True, - alias_generator=AliasGenerator( - # Support field names with hyphens - validation_alias=lambda field_name: AliasChoices( - field_name, field_name.replace("_", "-") - ), - ), - ) - - # Required - target: str = Field(description="Target endpoint URL for benchmark execution") - data: list[Any] = Field( - description="List of dataset sources or data files", - default_factory=list, - min_length=1, - ) - # Benchmark configuration - profile: StrategyType | ProfileType | Profile = Field( - default="sweep", description="Benchmark profile or scheduling strategy type" - ) - rate: list[float] | None = Field( - default=None, description="Request rate(s) for rate-based scheduling" - ) - # Backend configuration - backend: BackendType | Backend = Field( - default="openai_http", description="Backend type or instance for execution" - ) - backend_kwargs: dict[str, Any] | None = Field( - default=None, description="Additional backend configuration arguments" - ) - model: str | None = Field(default=None, description="Model identifier for backend") - # Data configuration - processor: str | Path | PreTrainedTokenizerBase | 
None = Field( - default=None, description="Tokenizer path, name, or instance for processing" - ) - processor_args: dict[str, Any] | None = Field( - default=None, description="Additional tokenizer configuration arguments" - ) - data_args: list[dict[str, Any]] | None = Field( - default_factory=list, description="Per-dataset configuration arguments" - ) - data_samples: int = Field( - default=-1, description="Number of samples to use from datasets (-1 for all)" - ) - data_column_mapper: ( - DatasetPreprocessor | dict[str, str] | Literal["generative_column_mapper"] - ) = Field( - default="generative_column_mapper", - description="Column mapping preprocessor for dataset fields", - ) - data_request_formatter: DatasetPreprocessor | dict[str, str] | str = Field( - default="chat_completions", - description="Request formatting preprocessor or template name", - validation_alias=AliasChoices( - "data_request_formatter", - "data-request-formatter", - "request_type", - "request-type", - ), - ) - data_collator: Callable | Literal["generative"] | None = Field( - default="generative", description="Data collator for batch processing" - ) - data_sampler: Sampler[int] | Literal["shuffle"] | None = Field( - default=None, description="Data sampler for request ordering" - ) - data_num_workers: int | None = Field( - default=None, description="Number of workers for data loading" - ) - dataloader_kwargs: dict[str, Any] | None = Field( - default=None, description="Additional dataloader configuration arguments" - ) - random_seed: int = Field(default=42, description="Random seed for reproducibility") - # Output configuration - output_path: str | Path | None = Field( - default_factory=Path.cwd, description="Directory path for output files" - ) - output_formats: list[str] | dict[str, str | dict[str, Any]] | None = Field( - default_factory=lambda: ["console", "json"], - description="Output format names or configuration mappings", - ) - # Benchmarker configuration - benchmark_cls: type[GenerativeBenchmark] = Field( - default=GenerativeBenchmark, - description="Benchmark class to use for result compilation", - ) - sample_requests: int | None = Field( - default=10, - description="Number of requests to sample for detailed metrics (None for all)", - ) - warmup: float | None = Field( - default=None, - description="Warmup period in seconds, requests, or fraction (0-1)", - ) - cooldown: float | None = Field( - default=None, - description="Cooldown period in seconds, requests, or fraction (0-1)", - ) - prefer_response_metrics: bool = Field( - default=True, - description="Whether to prefer backend response metrics over request metrics", - ) - # Constraints configuration - max_seconds: int | float | None = Field( - default=None, description="Maximum benchmark execution time in seconds" - ) - max_requests: int | None = Field( - default=None, description="Maximum number of requests to execute" - ) - max_errors: int | None = Field( - default=None, description="Maximum number of errors before stopping" - ) - max_error_rate: float | None = Field( - default=None, description="Maximum error rate (0-1) before stopping" - ) - max_global_error_rate: float | None = Field( - default=None, description="Maximum global error rate (0-1) before stopping" - ) - - @field_validator("data", "data_args", "rate", mode="wrap") - @classmethod - def single_to_list( - cls, value: Any, handler: ValidatorFunctionWrapHandler - ) -> list[Any]: - """ - Ensures field is always a list. 
- - :param value: Input value for the 'data' field - :return: List of data sources - """ - try: - return handler(value) - except ValidationError as err: - # If validation fails, try wrapping the value in a list - if err.errors()[0]["type"] == "list_type": - return handler([value]) - else: - raise - - @model_serializer - def serialize_model(self): - """ - Custom serialization logic for benchmark args. - - Converts complex types to serializable formats including Profile to type - string, Backend to type string, and Path objects to strings. - - :return: Dictionary representation suitable for JSON/YAML serialization - """ - return { - # target - serialize as is - "target": self.target, - "data": [ - item if isinstance(item, str | type(None)) else str(item) - for item in self.data - ], # data - for each item in the list, if not a str or None, save str(item) - "profile": ( - self.profile.type_ - if isinstance(self.profile, Profile) - else self.profile - ), # profile - if instance of Profile, then save as profile.type_ - "rate": self.rate, - "backend": ( - self.backend.type_ - if isinstance(self.backend, Backend) - else self.backend - ), # backend - if instance of Backend, then save as backend.type_ - "backend_kwargs": self.backend_kwargs, - "model": self.model, - "processor": ( - self.processor - if isinstance(self.processor, str) - else str(self.processor) - if self.processor is not None - else None - ), # processor - if not str, then save as str(processor) - "processor_args": self.processor_args, - "data_args": self.data_args, - "data_samples": self.data_samples, - "data_column_mapper": ( - self.data_column_mapper - if isinstance(self.data_column_mapper, dict | str) - else {} - ), # data_column_mapper - if not dict or str, then save as an empty dict - "data_request_formatter": ( - self.data_request_formatter - if isinstance(self.data_request_formatter, dict | str) - else {} - ), # data_request_formatter - if not dict or str, then save as empty dict - "data_collator": ( - self.data_collator if isinstance(self.data_collator, str) else None - ), # data_collator - if not str, then save as None - "data_sampler": ( - self.data_sampler if isinstance(self.data_sampler, str) else None - ), # data_sampler - if not str, then save as None - "data_num_workers": self.data_num_workers, - "dataloader_kwargs": self.dataloader_kwargs, - "random_seed": self.random_seed, - "output_path": ( - str(self.output_path) if self.output_path is not None else None - ), # output_path - if not None, then ensure it's a str - "output_formats": self.output_formats, - # benchmark_cls - don't save at all (excluded) - "sample_requests": self.sample_requests, - "warmup": self.warmup, - "cooldown": self.cooldown, - "prefer_response_metrics": self.prefer_response_metrics, - "max_seconds": self.max_seconds, - "max_requests": self.max_requests, - "max_errors": self.max_errors, - "max_error_rate": self.max_error_rate, - "max_global_error_rate": self.max_global_error_rate, - } - - -class GenerativeBenchmarksReport(StandardBaseModel): - """Container for multiple benchmark results with load/save functionality.""" - - DEFAULT_FILE: ClassVar[str] = "benchmarks.json" - - @staticmethod - def load_file( - path: str | Path, type_: Literal["json", "yaml"] | None = None - ) -> GenerativeBenchmarksReport: - """ - Load a report from a file. - - :param path: The path to load the report from. - :param type_: File type override, auto-detected from extension if None. - :return: The loaded report. - :raises ValueError: If file type is unsupported. 
- """ - path = Path(path) if not isinstance(path, Path) else path - - if path.is_dir(): - path = path / GenerativeBenchmarksReport.DEFAULT_FILE - - path.parent.mkdir(parents=True, exist_ok=True) - path_suffix = path.suffix.lower()[1:] - - with path.open("r") as file: - if (type_ or path_suffix) == "json": - model_dict = json.loads(file.read()) - elif (type_ or path_suffix) in ["yaml", "yml"]: - model_dict = yaml.safe_load(file) - else: - raise ValueError(f"Unsupported file type: {type_} for {path}.") - - return GenerativeBenchmarksReport.model_validate(model_dict) - - args: BenchmarkGenerativeTextArgs = Field( - description="The benchmark arguments used for all benchmarks in the report." - ) - benchmarks: list[GenerativeBenchmark] = Field( - description="The list of completed benchmarks contained within the report.", - default_factory=list, - ) - - def save_file( - self, path: str | Path | None, type_: Literal["json", "yaml"] | None = None - ) -> Path: - """ - Save the report to a file. - - :param path: The path to save the report to. - :param type_: File type override, auto-detected from extension if None. - :return: The path to the saved report. - :raises ValueError: If file type is unsupported. - """ - if path is None: - path = Path.cwd() - elif not isinstance(path, Path): - path = Path(path) - - if path.is_dir(): - path = path / GenerativeBenchmarksReport.DEFAULT_FILE - - path.parent.mkdir(parents=True, exist_ok=True) - path_suffix = path.suffix.lower()[1:] - model_dict = self.model_dump() - - if (type_ or path_suffix) == "json": - save_str = json.dumps(model_dict) - elif (type_ or path_suffix) in ["yaml", "yml"]: - save_str = yaml.dump(model_dict) - else: - raise ValueError(f"Unsupported file type: {type_} for {path}.") - - with path.open("w") as file: - file.write(save_str) - - return path diff --git a/src/guidellm/benchmark/schemas/__init__.py b/src/guidellm/benchmark/schemas/__init__.py new file mode 100644 index 00000000..fd0f5016 --- /dev/null +++ b/src/guidellm/benchmark/schemas/__init__.py @@ -0,0 +1,64 @@ +""" +Benchmark schemas for performance measurement and result analysis. + +This module consolidates the complete benchmark schema ecosystem, providing both +base abstractions for benchmark execution and domain-specific implementations +for generative AI tasks. It exports core configuration objects, accumulator +interfaces for real-time metric collection, benchmark result containers with +statistical summaries, and reporting utilities. The schemas support flexible +scheduling strategies, comprehensive metric tracking including latency and +throughput distributions, and multi-modal generative benchmarks for text, image, +video, and audio generation tasks. 
+""" + +from __future__ import annotations + +from .base import ( + Benchmark, + BenchmarkAccumulator, + BenchmarkAccumulatorT, + BenchmarkConfig, + BenchmarkT, +) +from .generative import ( + BenchmarkGenerativeTextArgs, + GenerativeAudioMetricsSummary, + GenerativeBenchmark, + GenerativeBenchmarkAccumulator, + GenerativeBenchmarksReport, + GenerativeBenchmarkTimings, + GenerativeImageMetricsSummary, + GenerativeMetrics, + GenerativeMetricsAccumulator, + GenerativeMetricsSummary, + GenerativeRequestsAccumulator, + GenerativeTextMetricsSummary, + GenerativeVideoMetricsSummary, + RunningMetricStats, + SchedulerMetrics, + SchedulerMetricsAccumulator, +) + +__all__ = [ + "Benchmark", + "BenchmarkAccumulator", + "BenchmarkAccumulatorT", + "BenchmarkConfig", + "BenchmarkGenerativeTextArgs", + "BenchmarkT", + "GenerativeAudioMetricsSummary", + "GenerativeBenchmark", + "GenerativeBenchmarkAccumulator", + "GenerativeBenchmarkTimings", + "GenerativeBenchmarksReport", + "GenerativeImageMetricsSummary", + "GenerativeMetrics", + "GenerativeMetricsAccumulator", + "GenerativeMetricsSummary", + "GenerativeRequestsAccumulator", + "GenerativeTextMetricsSummary", + "GenerativeVideoMetricsSummary", + "RunningMetricStats", + "SchedulerMetrics", + "SchedulerMetricsAccumulator", +] diff --git a/src/guidellm/benchmark/schemas/base.py b/src/guidellm/benchmark/schemas/base.py new file mode 100644 index 00000000..91e2fa95 --- /dev/null +++ b/src/guidellm/benchmark/schemas/base.py @@ -0,0 +1,190 @@ +""" +Core benchmark schemas for performance measurement and result analysis. + +Provides base classes and configuration for benchmark execution, including +accumulation of metrics during scheduler runs and compilation of final results. +Supports configurable scheduling strategies with comprehensive metric collection +for latency, throughput, and concurrency analysis. +""" + +from __future__ import annotations + +import uuid +from abc import ABC, abstractmethod +from typing import Any, Generic, TypeVar + +from pydantic import Field + +from guidellm.benchmark.profile import Profile +from guidellm.scheduler import ( + MultiTurnRequestT, + RequestT, + ResponseT, + SchedulerState, + SchedulingStrategy, +) +from guidellm.schemas import RequestInfo, StandardBaseDict, StatusDistributionSummary + +__all__ = [ + "Benchmark", + "BenchmarkAccumulator", + "BenchmarkAccumulatorT", + "BenchmarkConfig", + "BenchmarkT", +] + +BenchmarkAccumulatorT = TypeVar( + "BenchmarkAccumulatorT", bound="BenchmarkAccumulator[Any, Any]" +) + +BenchmarkT = TypeVar("BenchmarkT", bound="Benchmark") + + +class BenchmarkConfig(StandardBaseDict): + """ + Configuration parameters for benchmark execution. + + Encapsulates scheduler strategy, request sampling, warmup/cooldown phases, + and metric collection preferences for controlled benchmark runs. 
+ """ + + id_: str = Field( + default_factory=lambda: str(uuid.uuid4()), + description="Unique identifier for this benchmark execution", + ) + run_id: str = Field( + description="Unique identifier for the benchmark run", + ) + run_index: int = Field( + description="Sequential index of this run within a benchmark series", + ) + strategy: SchedulingStrategy = Field( + description="Scheduling strategy for request execution", + ) + constraints: dict[str, dict[str, Any]] = Field( + description="Constraints applied to the scheduling strategy", + ) + sample_requests: int | None = Field( + default=20, + description="Number of requests to sample for final benchmark metrics", + ) + warmup: int | float | None = Field( + default=None, + description="Warmup period in seconds before benchmarking starts", + ) + cooldown: int | float | None = Field( + default=None, + description="Cooldown period in seconds after benchmarking ends", + ) + prefer_response_metrics: bool = Field( + default=True, + description="Whether to prioritize response metrics over request metrics", + ) + profile: Profile = Field( + description="Benchmark profile defining execution parameters", + ) + requests: dict[str, Any] = Field( + description="Request configuration and dataset information", + ) + backend: dict[str, Any] = Field( + description="Backend configuration and connection details", + ) + environment: dict[str, Any] = Field( + description="Execution environment configuration and metadata", + ) + + +class BenchmarkAccumulator(StandardBaseDict, ABC, Generic[RequestT, ResponseT]): + """ + Accumulates metrics and state during benchmark execution. + + Tracks benchmark progress by updating estimates as requests are processed, + enabling incremental metric collection during scheduler runs. + """ + + config: BenchmarkConfig = Field( + description="Configuration parameters for this benchmark execution", + ) + + @abstractmethod + def update_estimate( + self, + response: ResponseT | None, + request: RequestT | MultiTurnRequestT[RequestT], + info: RequestInfo, + scheduler_state: SchedulerState, + ): + """ + Update benchmark estimates with new request/response data. + + :param response: Response from the backend, if available + :param request: Request submitted to the backend + :param info: Metadata about request execution timing and status + :param scheduler_state: Current state of the scheduler + """ + ... + + +class Benchmark(StandardBaseDict, ABC, Generic[BenchmarkAccumulatorT]): + """ + Abstract base class for benchmark result implementations. + + Defines the interface for capturing execution metrics and compiling final results + from scheduler-driven workload executions, including request latency, throughput, + and concurrency distributions. 
+ """ + + @property + @abstractmethod + def start_time(self) -> float: + """ + :return: Benchmark start time in seconds since epoch + """ + + @property + @abstractmethod + def end_time(self) -> float: + """ + :return: Benchmark end time in seconds since epoch + """ + + @property + @abstractmethod + def duration(self) -> float: + """ + :return: Total benchmark execution duration in seconds + """ + + @property + @abstractmethod + def request_latency(self) -> StatusDistributionSummary: + """ + :return: Distribution of request latencies across all processed requests + """ + + @property + @abstractmethod + def request_throughput(self) -> StatusDistributionSummary: + """ + :return: Distribution of request throughput across benchmark duration + """ + + @property + @abstractmethod + def request_concurrency(self) -> StatusDistributionSummary: + """ + :return: Distribution of concurrent requests across benchmark duration + """ + + @classmethod + @abstractmethod + def compile( + cls, accumulator: BenchmarkAccumulatorT, scheduler_state: SchedulerState + ) -> Any: + """ + Compile final benchmark results from accumulated metrics. + + :param accumulator: Accumulated benchmark state with request statistics + :param scheduler_state: Final state of the scheduler after execution + :return: Compiled benchmark instance with complete results + """ diff --git a/src/guidellm/benchmark/schemas/generative/__init__.py b/src/guidellm/benchmark/schemas/generative/__init__.py new file mode 100644 index 00000000..ad70fde0 --- /dev/null +++ b/src/guidellm/benchmark/schemas/generative/__init__.py @@ -0,0 +1,54 @@ +""" +Generative AI benchmark schemas for performance measurement and analysis. + +This module provides the complete schema ecosystem for executing, tracking, and +analyzing generative AI benchmarks. It encompasses configuration entrypoints for +benchmark setup, real-time metric accumulators for execution monitoring, +comprehensive result containers with statistical summaries, and multi-benchmark +reporting capabilities. The schemas support domain-specific metrics for text, +image, video, and audio generation tasks, enabling detailed performance analysis +including throughput, latency distributions, concurrency patterns, and scheduler +behavior tracking across successful, incomplete, and errored requests. 
+""" + +from __future__ import annotations + +from .accumulator import ( + GenerativeBenchmarkAccumulator, + GenerativeBenchmarkTimings, + GenerativeMetricsAccumulator, + GenerativeRequestsAccumulator, + RunningMetricStats, + SchedulerMetricsAccumulator, +) +from .benchmark import GenerativeBenchmark +from .entrypoints import BenchmarkGenerativeTextArgs +from .metrics import ( + GenerativeAudioMetricsSummary, + GenerativeImageMetricsSummary, + GenerativeMetrics, + GenerativeMetricsSummary, + GenerativeTextMetricsSummary, + GenerativeVideoMetricsSummary, + SchedulerMetrics, +) +from .report import GenerativeBenchmarksReport + +__all__ = [ + "BenchmarkGenerativeTextArgs", + "GenerativeAudioMetricsSummary", + "GenerativeBenchmark", + "GenerativeBenchmarkAccumulator", + "GenerativeBenchmarkTimings", + "GenerativeBenchmarksReport", + "GenerativeImageMetricsSummary", + "GenerativeMetrics", + "GenerativeMetricsAccumulator", + "GenerativeMetricsSummary", + "GenerativeRequestsAccumulator", + "GenerativeTextMetricsSummary", + "GenerativeVideoMetricsSummary", + "RunningMetricStats", + "SchedulerMetrics", + "SchedulerMetricsAccumulator", +] diff --git a/src/guidellm/benchmark/schemas/generative/accumulator.py b/src/guidellm/benchmark/schemas/generative/accumulator.py new file mode 100644 index 00000000..20ef08c0 --- /dev/null +++ b/src/guidellm/benchmark/schemas/generative/accumulator.py @@ -0,0 +1,847 @@ +""" +Real-time metric accumulation for generative benchmark execution. + +Captures and computes performance metrics during benchmark runs, tracking timing phases, +request statistics, token throughput, and latency distributions. Components include +timing trackers for warmup/cooldown phases, running statistical accumulators for +throughput and latency metrics, and reservoir sampling for request data. Enables +comprehensive performance measurement including scheduler overhead, time-to-first-token, +inter-token latency, and token generation rates across completed, errored, and +incomplete requests. +""" + +from __future__ import annotations + +import random +import time +from typing import Literal + +from pydantic import Field + +from guidellm.benchmark.schemas.base import BenchmarkAccumulator, BenchmarkConfig +from guidellm.scheduler import MultiTurnRequestT, SchedulerState +from guidellm.schemas import ( + GenerationRequest, + GenerationResponse, + GenerativeRequestStats, + RequestInfo, + RequestTimings, + StandardBaseModel, + StatusBreakdown, +) + +__all__ = [ + "GenerativeBenchmarkAccumulator", + "GenerativeBenchmarkTimings", + "GenerativeMetricsAccumulator", + "GenerativeRequestsAccumulator", + "RunningMetricStats", + "SchedulerMetricsAccumulator", +] + + +class GenerativeBenchmarkTimings(StandardBaseModel): + """ + Tracks timing phases and transitions during benchmark execution. + + Monitors timestamps throughout benchmark execution including request submission, + measurement period boundaries (warmup/active/cooldown), and completion events. + Provides duration calculations and phase status determination based on configured + warmup and cooldown periods. 
+ """ + + request_start: float | None = Field( + description="Timestamp when the first request was sent", default=None + ) + measure_start: float | None = Field( + description="Timestamp when measurement period started", default=None + ) + measure_end: float | None = Field( + description="Timestamp when measurement period ended", default=None + ) + request_end: float | None = Field( + description="Timestamp when the last request was completed", default=None + ) + current_update: float | None = Field( + description="Most recent timestamp observed during execution", default=None + ) + current_request: float | None = Field( + description="Most recent request completion timestamp observed", default=None + ) + last_update: float | None = Field( + description="Previous timestamp observed before the current one", default=None + ) + last_request: float | None = Field( + description="Previous request completion timestamp before the current one", + default=None, + ) + + @property + def status(self) -> Literal["pending", "warmup", "active", "cooldown"]: + """ + :return: Current execution phase based on timing thresholds + """ + if self.request_start is None: + return "pending" + + if self.measure_start is None: + return "warmup" + + if self.measure_end is None: + return "active" + + return "cooldown" + + @property + def duration(self) -> float: + """ + :return: Elapsed time since measurement or request start in seconds + """ + if self.current_update is None: + return 0.0 + + start_time = self.measure_start or self.request_start + + return (self.current_update - start_time) if start_time is not None else 0.0 + + @property + def elapsed_time_last_update(self) -> float: + """ + :return: Time elapsed between the last two update timestamps in seconds + """ + if self.current_update is None or self.last_update is None: + return 0.0 + + return self.current_update - self.last_update + + @property + def elapsed_time_last_request(self) -> float: + """ + :return: Time elapsed between the last two request completions in seconds + """ + if self.current_request is None or self.last_request is None: + return 0.0 + + return self.current_request - self.last_request + + def update_estimate( + self, + info: RequestInfo, + scheduler_state: SchedulerState, + config: BenchmarkConfig, + ): + """ + Update timing estimates based on request info and scheduler state. + + Advances timing markers through benchmark phases (warmup to active to cooldown) + based on configured thresholds. Updates current/last timestamps for updates and + request completions, determining measurement period boundaries. 
+ + :param info: Request information containing timing data + :param scheduler_state: Current scheduler state with progress metrics + :param config: Benchmark configuration with warmup/cooldown settings + """ + request_start = info.timings.request_start or info.timings.resolve_start + request_end = info.timings.request_end or info.timings.resolve_end + current_time = info.timings.last_reported + + self.request_start = self.request_start or request_start + + if request_end is not None and ( + self.request_end is None or request_end > self.request_end + ): + # Always update request end to the max seen so far + self.request_end = request_end + + # Update last and current update times + self.last_update = self.current_update + if current_time is not None and ( + self.current_update is None or current_time > self.current_update + ): + self.current_update = current_time + + # Update last and current request times, if applicable + if info.status in {"completed", "errored", "cancelled"}: + self.last_request = self.current_request + if request_end is not None and ( + self.current_request is None or request_end > self.current_request + ): + self.current_request = request_end + + # Update measurement start time based on warmup configuration + if config.warmup is not None and self.measure_start is None: + exceeded_time = ( + config.warmup >= 1.0 + and scheduler_state.remaining_duration is not None + and self.duration is not None + and self.duration >= config.warmup + ) + exceeded_count = ( + config.warmup >= 1.0 + and scheduler_state.remaining_requests is not None + and scheduler_state.processed_requests >= config.warmup + ) + exceeded_fraction = ( + config.warmup < 1.0 + and scheduler_state.remaining_fraction is not None + and 1.0 - scheduler_state.remaining_fraction >= config.warmup + ) + + if exceeded_time or exceeded_count or exceeded_fraction: + self.measure_start = self.current_update + elif config.warmup is None and self.measure_start is None: + # No warmup configured, start measuring at first request + self.measure_start = self.request_start + + # Update measurement end time based on cooldown configuration + if config.cooldown is not None and self.measure_end is None: + exceeded_time = ( + config.cooldown >= 1.0 + and scheduler_state.remaining_duration is not None + and scheduler_state.remaining_duration <= config.cooldown + ) + exceeded_count = ( + config.cooldown >= 1.0 + and scheduler_state.remaining_requests is not None + and scheduler_state.remaining_requests <= config.cooldown + ) + exceeded_fraction = ( + config.cooldown < 1.0 + and scheduler_state.remaining_fraction is not None + and scheduler_state.remaining_fraction <= config.cooldown + ) + + if exceeded_time or exceeded_count or exceeded_fraction: + self.measure_end = self.current_update + + +class RunningMetricStats(StandardBaseModel): + """ + Maintains running statistics for a metric stream without storing all samples. + + Accumulates count, sum, time-weighted sum, and duration to compute mean, rate, + and time-weighted statistics incrementally. Efficient for real-time metric tracking + during long-running benchmarks where storing individual samples is impractical. 
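+
+    Example (illustrative; the values follow directly from the update rules below)::
+
+        stats = RunningMetricStats()
+        stats.update_estimate(10.0, elapsed=1.0)
+        stats.update_estimate(20.0, elapsed=1.0)
+        stats.mean                # 15.0 (value_sum 30.0 / 2 samples)
+        stats.time_weighted_mean  # 5.0 (previous value 10.0 held for 1s of 2s total)
+        stats.rate_per_second     # 15.0 (value_sum 30.0 / 2.0s duration)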
+ """ + + count: int = Field(description="Number of samples accumulated", default=0) + value_sum: float = Field(description="Total sum of accumulated values", default=0.0) + time_weighted_sum: float = Field( + description="Time-weighted sum of accumulated values", default=0.0 + ) + duration: float = Field( + description="Total duration over which values were accumulated", default=0.0 + ) + last_value: float | None = Field( + description="Most recent value added to the accumulator", default=None + ) + + @property + def mean(self) -> float | None: + """ + :return: Arithmetic mean of accumulated values, or None if no samples + """ + if self.count <= 0: + return None + + return self.value_sum / self.count + + @property + def time_weighted_mean(self) -> float | None: + """ + :return: Time-weighted mean considering duration between samples, or None + """ + if self.duration <= 0.0: + return None + + return self.time_weighted_sum / self.duration + + @property + def rate_per_item(self) -> float | None: + """ + :return: Average value per accumulated item, or None if no samples + """ + if self.count <= 0: + return None + + return self.value_sum / self.count + + @property + def rate_per_second(self) -> float | None: + """ + :return: Average value per second of duration, or None if no duration + """ + if self.duration <= 0.0: + return None + + return self.value_sum / self.duration + + def update_estimate( + self, + value: float | None, + count: int = 1, + duration: float | None = None, + elapsed: float | None = None, + ): + """ + Incorporate a new metric value into running statistics. + + Updates count, sum, and time-weighted statistics using the new value and timing + information. Time-weighted calculations use the previous value over the elapsed + interval to capture sustained metric behavior. + + :param value: New metric value to accumulate + :param count: Number of occurrences this value represents + :param duration: Total duration to set, overriding incremental elapsed updates + :param elapsed: Time elapsed since last update for time-weighted calculations + """ + self.count += count + self.value_sum += (value or 0.0) * count + + if elapsed is not None: + self.time_weighted_sum += (self.last_value or 0.0) * elapsed + + self.duration = ( + duration if duration is not None else (self.duration + (elapsed or 0.0)) + ) + self.last_value = value + + +class SchedulerMetricsAccumulator(StandardBaseModel): + """ + Tracks scheduler-level timing and overhead metrics during execution. + + Monitors request lifecycle timing from queuing through completion, capturing delays + at each stage: queue time, worker start delays, request processing time, and + finalization overhead. Provides insight into scheduler efficiency and bottleneck + identification in request orchestration. 
+ """ + + requests_made: StatusBreakdown[int, int, int, int] = Field( + description="Request counts by status: successful, incomplete, errored, total", + default_factory=lambda: StatusBreakdown[int, int, int, int]( + successful=0, errored=0, incomplete=0, total=0 + ), + ) + # Timings flow: + # Request scheduling: queued->dequeued->scheduled_at->resolve_start-> + # Request processing: request_start->*_iteration->request_end-> + # Request finalizing: resolve_end->finalized->accumulation update processed + queued_time: RunningMetricStats = Field( + default_factory=RunningMetricStats, + description="Running stats for time requests spent in the queue", + ) + resolve_start_delay: RunningMetricStats = Field( + default_factory=RunningMetricStats, + description=( + "Running stats for delay before worker begins resolving req after dequeue" + ), + ) + resolve_targeted_start_delay: RunningMetricStats = Field( + default_factory=RunningMetricStats, + description=( + "Running stats for delay from targeted start to actual worker start" + ), + ) + request_start_delay: RunningMetricStats = Field( + default_factory=RunningMetricStats, + description="Running stats for delay after resolve til request start", + ) + request_targeted_start_delay: RunningMetricStats = Field( + default_factory=RunningMetricStats, + description=( + "Running stats for delay from targeted start to actual request start" + ), + ) + request_time: RunningMetricStats = Field( + default_factory=RunningMetricStats, + description="Running stats for request processing time", + ) + resolve_end_delay: RunningMetricStats = Field( + default_factory=RunningMetricStats, + description="Running stats for delay after request end till worker resolves", + ) + resolve_time: RunningMetricStats = Field( + default_factory=RunningMetricStats, + description="Running stats for time for worker to resolve requests", + ) + finalized_delay: RunningMetricStats = Field( + default_factory=RunningMetricStats, + description="Running stats for delay after resolve til finalized in scheduler", + ) + processed_delay: RunningMetricStats = Field( + default_factory=RunningMetricStats, + description=( + "Running stats for delay from finalized til request being " + "processed by accumulation" + ), + ) + + def update_estimate( + self, scheduler_state: SchedulerState, stats: GenerativeRequestStats + ): + """ + Update scheduler metrics with completed request timing data. + + Extracts timing information from request statistics to update running metrics + for each scheduler lifecycle stage. Validates that required timing markers are + present before processing. 
+ + :param scheduler_state: Current scheduler state with request counts + :param stats: Completed request statistics with detailed timing information + :raises ValueError: If required timing markers are missing + """ + # Update request counts + self.requests_made.successful = scheduler_state.successful_requests + self.requests_made.errored = scheduler_state.errored_requests + self.requests_made.incomplete = scheduler_state.cancelled_requests + self.requests_made.total = ( + scheduler_state.successful_requests + + scheduler_state.errored_requests + + scheduler_state.cancelled_requests + ) + + # All requests must have queued, dequeued, resolve_end, and finalized timings + timings: RequestTimings = stats.info.timings + if any( + timing is None + for timing in [ + timings.queued, + timings.dequeued, + timings.resolve_end, + timings.finalized, + ] + ): + raise ValueError( + "Required timings 'queued', 'dequeued', 'resolve_end', and " + "'finalized' must not be None" + ) + + # Store validated non-None timings for type safety + queued: float = timings.queued # type: ignore[assignment] + dequeued: float = timings.dequeued # type: ignore[assignment] + resolve_end: float = timings.resolve_end # type: ignore[assignment] + finalized: float = timings.finalized # type: ignore[assignment] + + # Update timing metrics in occurrence order + self.queued_time.update_estimate(value=dequeued - queued) + + if timings.scheduled_at is not None and timings.resolve_start is not None: + self.resolve_start_delay.update_estimate( + value=timings.resolve_start - timings.scheduled_at + ) + + if timings.targeted_start is not None and timings.resolve_start is not None: + self.resolve_targeted_start_delay.update_estimate( + value=timings.resolve_start - timings.targeted_start + ) + + if timings.resolve_start is not None and timings.request_start is not None: + self.request_start_delay.update_estimate( + value=timings.request_start - timings.resolve_start + ) + + if timings.targeted_start is not None and timings.request_start is not None: + self.request_targeted_start_delay.update_estimate( + value=timings.request_start - timings.targeted_start + ) + + if timings.request_start is not None and timings.request_end is not None: + self.request_time.update_estimate( + value=timings.request_end - timings.request_start + ) + + if timings.request_end is not None: + self.resolve_end_delay.update_estimate( + value=resolve_end - timings.request_end + ) + + if timings.resolve_start is not None: + self.resolve_time.update_estimate(value=resolve_end - timings.resolve_start) + + self.finalized_delay.update_estimate(value=finalized - resolve_end) + self.processed_delay.update_estimate(value=time.time() - finalized) + + +class GenerativeMetricsAccumulator(StandardBaseModel): + """ + Accumulates generative model performance metrics during execution. + + Tracks token throughput, latency characteristics, and request timing for generative + workloads. Maintains running statistics for input/output tokens, + time-to-first-token, inter-token latency, and streaming patterns for comprehensive + performance analysis. 
+ """ + + requests: RunningMetricStats = Field( + default_factory=RunningMetricStats, + description="Accumulated request count statistics", + ) + request_latency: RunningMetricStats = Field( + default_factory=RunningMetricStats, + description="Accumulated request latency statistics", + ) + input_tokens: RunningMetricStats = Field( + default_factory=RunningMetricStats, + description="Accumulated input token count statistics", + ) + output_tokens: RunningMetricStats = Field( + default_factory=RunningMetricStats, + description="Accumulated output token count statistics", + ) + total_tokens: RunningMetricStats = Field( + default_factory=RunningMetricStats, + description="Accumulated total token count statistics", + ) + time_to_first_token_ms: RunningMetricStats = Field( + default_factory=RunningMetricStats, + description="Accumulated time to first token statistics in milliseconds", + ) + time_per_output_token_ms: RunningMetricStats = Field( + default_factory=RunningMetricStats, + description="Accumulated time per output token statistics in milliseconds", + ) + inter_token_latency_ms: RunningMetricStats = Field( + default_factory=RunningMetricStats, + description="Accumulated inter-token latency statistics in milliseconds", + ) + streaming_iterations: RunningMetricStats = Field( + default_factory=RunningMetricStats, + description="Accumulated streaming iteration count statistics", + ) + output_tokens_by_iteration: RunningMetricStats = Field( + default_factory=RunningMetricStats, + description="Accumulated output tokens per iteration statistics", + ) + iter_tokens_by_iteration: RunningMetricStats = Field( + default_factory=RunningMetricStats, + description="Accumulated iteration tokens per iteration statistics", + ) + + def update_estimate(self, stats: GenerativeRequestStats, duration: float): + """ + Update generative metrics with completed request statistics. + + Incorporates token counts, latency measurements, and streaming characteristics + from a completed request into running metric accumulators with time-weighted + calculations. + + :param stats: Request statistics containing token and latency measurements + :param duration: Current benchmark duration for time-weighted metrics + """ + self.requests.update_estimate(1.0, duration=duration) + self.input_tokens.update_estimate(stats.input_tokens, duration=duration) + self.output_tokens.update_estimate(stats.output_tokens, duration=duration) + self.total_tokens.update_estimate(stats.total_tokens, duration=duration) + self.request_latency.update_estimate(stats.request_latency, duration=duration) + self.time_to_first_token_ms.update_estimate( + stats.time_to_first_token_ms, duration=duration + ) + self.time_per_output_token_ms.update_estimate( + stats.time_per_output_token_ms, + count=int(stats.output_tokens or 0), + duration=duration, + ) + self.inter_token_latency_ms.update_estimate( + stats.inter_token_latency_ms, + count=int((stats.output_tokens or 1) - 1), + duration=duration, + ) + self.streaming_iterations.update_estimate( + stats.token_iterations, duration=duration + ) + self.output_tokens_by_iteration.update_estimate( + stats.output_tokens_per_iteration, + count=int(stats.token_iterations or 0), + duration=duration, + ) + self.iter_tokens_by_iteration.update_estimate( + stats.iter_tokens_per_iteration, + count=int((stats.token_iterations or 1) - 1), + duration=duration, + ) + + +class GenerativeRequestsAccumulator(StandardBaseModel): + """ + Manages request statistics collection with optional reservoir sampling. 
+ + Collects detailed request statistics while optionally sampling to limit memory usage + in long-running benchmarks. Supports configurable sampling rates and selective data + retention (clearing request arguments and/or outputs for non-sampled requests). + """ + + sample_requests: int | None = Field( + default=None, + description=( + "Number of requests to sample and keep in the final benchmark for metrics" + ), + ) + requests_stats: list[GenerativeRequestStats] = Field( + description="List of generative request statistics", default_factory=list + ) + samples: list[int] | None = Field( + description="Indices of sampled generative requests", default=None + ) + clear_nonsampled_request_args: bool = Field( + default=True, + description=( + "Whether to clear request arguments and outputs for non-sampled requests" + ), + ) + clear_nonsampled_outputs: bool = Field( + default=True, + description=( + "Whether to clear outputs for non-sampled requests while keeping args" + ), + ) + + def get_sampled(self) -> list[GenerativeRequestStats]: + """ + Retrieve the list of sampled request statistics. + + :return: List of sampled generative request statistics + """ + if self.samples is None: + return self.requests_stats + + return [self.requests_stats[ind] for ind in self.samples] + + def get_within_range( + self, start_time: float, end_time: float + ) -> list[GenerativeRequestStats]: + """ + Retrieve request statistics within a specified time range. + + :param start_time: Start timestamp for filtering (requests must end after this) + :param end_time: End timestamp for filtering (requests must start before this) + :return: List of request statistics within the time range + """ + return [ + stats + for stats in self.requests_stats + if (stats.request_end_time >= start_time) + and ( + ( + stats.request_start_time is not None + and stats.request_start_time <= end_time + ) + or ( + stats.request_start_time is None + and stats.request_end_time <= end_time + ) + ) + ] + + def update_estimate( + self, + response: GenerationResponse | None, + request: GenerationRequest | MultiTurnRequestT[GenerationRequest], + info: RequestInfo, + prefer_response_metrics: bool, + ) -> GenerativeRequestStats: + """ + Record request statistics and apply reservoir sampling if configured. + + Compiles statistics from the completed request and adds to the collection. + Uses reservoir sampling algorithm to maintain uniform sample distribution when + enabled, clearing non-sampled request data to manage memory. 
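+        Each recorded request is retained in the sample with equal probability
+        sample_requests / n, where n is the number of requests seen so far.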
+ + :param response: Generation response containing output and metrics + :param request: Original generation request with input data + :param info: Request execution information and timing + :param prefer_response_metrics: Whether to prefer metrics from response + :return: Compiled request statistics + """ + stats = self.compile_stats(response, request, info, prefer_response_metrics) + + current_index = len(self.requests_stats) + self.requests_stats.append(stats) + + if self.sample_requests is None: + # Keeping all requests, don't need to sample + self.samples = None + elif self.sample_requests <= 0: + # Not keeping any requests, clear out unnecessary memory usage for current + self.clear_stats_data(stats) + elif self.sample_requests >= len(self.requests_stats): + # Add directly to samples, haven't filled yet + if self.samples is None: + self.samples = [] + self.samples.append(current_index) + elif self.sample_requests / len(self.requests_stats) >= random.random(): + # Sampling logic: choose to replace with decreasing probability s / n + # where s is sample size, n is current number of requests. + # If chosen, choose random existing sample to replace. + # P(new item in samples) = s / n + # P(prev item in samples) = P(item was in samples) * P(not replaced) + # P(prev item in samples) = + # P(before replacement) * P(new item selected) * P(chosen from samples) + # P(prev item in samples) = (s / (n - 1)) * (s / n) * (1 / s) = s / n + # P(prev item in samples) = P(new item in samples) + if self.samples is None: + self.samples = [] + replace_index = random.randrange(len(self.samples)) + self.clear_stats_data(self.samples[replace_index]) + self.samples[replace_index] = current_index + + return stats + + def clear_stats_data(self, stats: GenerativeRequestStats | int): + if isinstance(stats, int): + stats = self.requests_stats[stats] + + if self.clear_nonsampled_request_args: + stats.request_args = None + if self.clear_nonsampled_outputs: + stats.output = None + + @classmethod + def compile_stats( + cls, + response: GenerationResponse | None, + request: GenerationRequest | MultiTurnRequestT[GenerationRequest], + info: RequestInfo, + prefer_response_metrics: bool, + ) -> GenerativeRequestStats: + """ + Compile statistics from request, response, and execution info. + + :param response: Generation response with output and metrics, or None + :param request: Original generation request with input data + :param info: Request execution information and timing + :param prefer_response_metrics: Whether to prefer metrics from response + :return: Compiled generative request statistics + """ + # Extract the first request for arguments if multi-turn + first_request: GenerationRequest + if isinstance(request, GenerationRequest): + first_request = request + else: + # Multi-turn request: extract first item + first_item = request[0] + first_request = ( + first_item[0] if isinstance(first_item, tuple) else first_item + ) + + if response is None: + response = GenerationResponse( + request_id=info.request_id, request_args=str(first_request.arguments) + ) + + return response.compile_stats( + request=first_request, + info=info, + prefer_response=prefer_response_metrics, + ) + + +class GenerativeBenchmarkAccumulator( + BenchmarkAccumulator[GenerationRequest, GenerationResponse] +): + """ + Primary accumulator for generative benchmark execution metrics and statistics. + + Orchestrates real-time metric collection across timing, scheduler, concurrency, and + generative performance dimensions. 
Maintains separate accumulators for completed, + errored, and incomplete requests while tracking overall metrics. Integrates with + scheduler state to monitor warmup/cooldown phases and compute time-weighted + statistics for throughput and latency analysis. + """ + + timings: GenerativeBenchmarkTimings = Field( + default_factory=GenerativeBenchmarkTimings, + description="Timing phases and transitions during benchmark execution", + ) + completed: GenerativeRequestsAccumulator = Field( + default_factory=GenerativeRequestsAccumulator, + description="Accumulator for completed requests", + ) + errored: GenerativeRequestsAccumulator = Field( + default_factory=GenerativeRequestsAccumulator, + description="Accumulator for errored requests", + ) + incomplete: GenerativeRequestsAccumulator = Field( + default_factory=GenerativeRequestsAccumulator, + description="Accumulator for incomplete requests", + ) + scheduler_metrics: SchedulerMetricsAccumulator = Field( + default_factory=SchedulerMetricsAccumulator, + description="Running metrics for scheduler state", + ) + concurrency_metric: RunningMetricStats = Field( + default_factory=RunningMetricStats, + description="Accumulated request concurrency statistics", + ) + total_metrics: GenerativeMetricsAccumulator = Field( + default_factory=GenerativeMetricsAccumulator, + description="Running metrics for all requests", + ) + completed_metrics: GenerativeMetricsAccumulator = Field( + default_factory=GenerativeMetricsAccumulator, + description="Running metrics for completed requests", + ) + errored_metrics: GenerativeMetricsAccumulator = Field( + default_factory=GenerativeMetricsAccumulator, + description="Running metrics for errored requests", + ) + incomplete_metrics: GenerativeMetricsAccumulator = Field( + default_factory=GenerativeMetricsAccumulator, + description="Running metrics for incomplete requests", + ) + + def update_estimate( + self, + response: GenerationResponse | None, + request: GenerationRequest | MultiTurnRequestT[GenerationRequest], + info: RequestInfo, + scheduler_state: SchedulerState, + ): + """ + Update all benchmark metrics with a completed request. + + Processes request completion by updating timing phases, concurrency metrics, + scheduler statistics, and generative performance metrics. Routes request to + appropriate status-specific accumulator (completed/errored/incomplete) and + updates aggregate totals. Cancelled requests that never started are ignored. 
+ + :param response: Generation response with output and metrics, or None + :param request: Original generation request with input data + :param info: Request execution information and timing + :param scheduler_state: Current scheduler state for phase tracking + """ + if info.status == "cancelled" and info.timings.resolve_start is None: + # Cancelled requests that never started should be ignored + return + + self.timings.update_estimate(info, scheduler_state, self.config) + + duration = self.timings.duration + elapsed_time_last_update = self.timings.elapsed_time_last_update + self.concurrency_metric.update_estimate( + value=scheduler_state.processing_requests, + duration=duration, + elapsed=elapsed_time_last_update, + ) + + requests_accumulator: GenerativeRequestsAccumulator + metrics_accumulator: GenerativeMetricsAccumulator + + if info.status == "completed": + requests_accumulator = self.completed + metrics_accumulator = self.completed_metrics + elif info.status == "errored": + requests_accumulator = self.errored + metrics_accumulator = self.errored_metrics + elif info.status == "cancelled": + requests_accumulator = self.incomplete + metrics_accumulator = self.incomplete_metrics + else: + return + + stats = requests_accumulator.update_estimate( + response, request, info, self.config.prefer_response_metrics + ) + metrics_accumulator.update_estimate(stats, duration) + self.total_metrics.update_estimate(stats, duration) + self.scheduler_metrics.update_estimate(scheduler_state, stats) diff --git a/src/guidellm/benchmark/schemas/generative/benchmark.py b/src/guidellm/benchmark/schemas/generative/benchmark.py new file mode 100644 index 00000000..1d7f83ca --- /dev/null +++ b/src/guidellm/benchmark/schemas/generative/benchmark.py @@ -0,0 +1,141 @@ +""" +Benchmark data models and metrics for generative AI performance measurement. + +Provides comprehensive data structures for capturing, storing, and analyzing +benchmark results from scheduler-driven generative AI workload executions. +Core abstractions include base benchmark interfaces, generative-specific +metrics with token/latency distributions, request-level statistics tracking, +and multi-benchmark reporting capabilities. These models enable detailed +performance analysis including throughput, latency, concurrency patterns, and +domain-specific metrics for text, image, video, and audio generation tasks. +""" + +from __future__ import annotations + +from typing import Literal + +from pydantic import Field, computed_field + +from guidellm.benchmark.schemas.base import Benchmark, BenchmarkConfig +from guidellm.benchmark.schemas.generative.accumulator import ( + GenerativeBenchmarkAccumulator, +) +from guidellm.benchmark.schemas.generative.metrics import ( + GenerativeMetrics, + SchedulerMetrics, +) +from guidellm.scheduler import SchedulerState +from guidellm.schemas import ( + GenerativeRequestStats, + StatusBreakdown, + StatusDistributionSummary, +) + +__all__ = ["GenerativeBenchmark"] + + +class GenerativeBenchmark(Benchmark[GenerativeBenchmarkAccumulator]): + """ + Complete generative AI benchmark results with specialized metrics. + + Encapsulates comprehensive performance data from scheduler-driven generative + workload executions including request-level statistics, token/latency distributions, + throughput analysis, and concurrency patterns. Provides computed fields for temporal + analysis and status-grouped request details for detailed post-execution reporting. 
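+
+    Example (illustrative sketch, assuming an accumulator and final scheduler state
+    from a completed run)::
+
+        benchmark = GenerativeBenchmark.compile(accumulator, scheduler_state)
+        benchmark.duration         # end_time - start_time of the measurement window
+        benchmark.request_latency  # StatusDistributionSummary across all requests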
+ """ + + type_: Literal["generative_benchmark"] = "generative_benchmark" # type: ignore[assignment] + + config: BenchmarkConfig = Field( + description="Configuration parameters for this benchmark execution", + ) + scheduler_state: SchedulerState = Field( + description="Final state of the scheduler after benchmark completion", + ) + scheduler_metrics: SchedulerMetrics = Field( + description="Scheduler timing and performance statistics", + ) + metrics: GenerativeMetrics = Field( + description="Performance metrics and statistical distributions", + ) + requests: StatusBreakdown[ + list[GenerativeRequestStats], + list[GenerativeRequestStats], + list[GenerativeRequestStats], + None, + ] = Field( + description=( + "Request details grouped by status: successful, incomplete, errored" + ), + ) + + @computed_field # type: ignore[prop-decorator] + @property + def start_time(self) -> float: + """ + :return: Benchmark start time in seconds since epoch + """ + return self.scheduler_metrics.measure_start_time + + @computed_field # type: ignore[prop-decorator] + @property + def end_time(self) -> float: + """ + :return: Benchmark end time in seconds since epoch + """ + return self.scheduler_metrics.measure_end_time + + @computed_field # type: ignore[prop-decorator] + @property + def duration(self) -> float: + """ + :return: Total benchmark execution duration in seconds + """ + return self.end_time - self.start_time + + @property + def request_latency(self) -> StatusDistributionSummary: + """ + :return: Statistical distribution of request latencies across all requests + """ + return self.metrics.request_latency + + @property + def request_throughput(self) -> StatusDistributionSummary: + """ + :return: Statistical distribution of throughput measured in requests per second + """ + return self.metrics.requests_per_second + + @property + def request_concurrency(self) -> StatusDistributionSummary: + """ + :return: Statistical distribution of concurrent requests throughout execution + """ + return self.metrics.request_concurrency + + @classmethod + def compile( + cls, + accumulator: GenerativeBenchmarkAccumulator, + scheduler_state: SchedulerState, + ) -> GenerativeBenchmark: + """ + Compile final benchmark results from accumulated execution state. + + :param accumulator: Accumulated benchmark state with request statistics + :param scheduler_state: Final scheduler state after execution completion + :return: Compiled generative benchmark instance with complete metrics + """ + return GenerativeBenchmark( + config=accumulator.config, + scheduler_state=scheduler_state, + scheduler_metrics=SchedulerMetrics.compile(accumulator, scheduler_state), + metrics=GenerativeMetrics.compile(accumulator), + requests=StatusBreakdown( + successful=accumulator.completed.get_sampled(), + incomplete=accumulator.incomplete.get_sampled(), + errored=accumulator.errored.get_sampled(), + total=None, + ), + ) diff --git a/src/guidellm/benchmark/schemas/generative/entrypoints.py b/src/guidellm/benchmark/schemas/generative/entrypoints.py new file mode 100644 index 00000000..c54b93b9 --- /dev/null +++ b/src/guidellm/benchmark/schemas/generative/entrypoints.py @@ -0,0 +1,361 @@ +""" +Configuration entrypoints for generative text benchmark execution. + +Defines parameter schemas and construction logic for creating benchmark runs from +scenario files or runtime arguments. Provides flexible configuration loading with +support for built-in scenarios, custom YAML/JSON files, and programmatic overrides. 
+Handles serialization of complex types including backends, processors, and profiles +for persistent storage and reproduction of benchmark configurations. +""" + +from __future__ import annotations + +import inspect +import json +from collections.abc import Callable +from pathlib import Path +from typing import Any, Literal + +import yaml +from pydantic import ( + AliasChoices, + AliasGenerator, + ConfigDict, + Field, + ValidationError, + ValidatorFunctionWrapHandler, + field_validator, + model_serializer, +) +from torch.utils.data import Sampler +from transformers import PreTrainedTokenizerBase + +from guidellm.backends import Backend, BackendType +from guidellm.benchmark.profile import Profile, ProfileType +from guidellm.benchmark.scenarios import get_builtin_scenarios +from guidellm.data import DatasetPreprocessor, RequestFormatter +from guidellm.scheduler import StrategyType +from guidellm.schemas import StandardBaseModel + +__all__ = ["BenchmarkGenerativeTextArgs"] + + +class BenchmarkGenerativeTextArgs(StandardBaseModel): + """ + Configuration arguments for generative text benchmark execution. + + Defines all parameters for benchmark setup including target endpoint, data + sources, backend configuration, processing pipeline, output formatting, and + execution constraints. Supports loading from scenario files and merging with + runtime overrides for flexible benchmark construction from multiple sources. + + Example:: + + # Load from built-in scenario with overrides + args = BenchmarkGenerativeTextArgs.create( + scenario="chat", + target="http://localhost:8000/v1", + max_requests=1000 + ) + + # Create from keyword arguments only + args = BenchmarkGenerativeTextArgs( + target="http://localhost:8000/v1", + data=["path/to/dataset.json"], + profile="fixed", + rate=10.0 + ) + """ + + @classmethod + def create( + cls, scenario: Path | str | None, **kwargs: dict[str, Any] + ) -> BenchmarkGenerativeTextArgs: + """ + Create benchmark args from scenario file and keyword arguments. + + Loads base configuration from scenario file (built-in or custom) and merges + with provided keyword arguments. Arguments explicitly set via kwargs override + scenario values, while defaulted kwargs are ignored to preserve scenario + settings. 
+ + :param scenario: Path to scenario file, built-in scenario name, or None + :param kwargs: Keyword arguments to override scenario values + :return: Configured benchmark args instance + :raises ValueError: If scenario is not found or file format is unsupported + """ + constructor_kwargs = {} + + if scenario is not None: + if isinstance(scenario, str) and scenario in ( + builtin_scenarios := get_builtin_scenarios() + ): + scenario_path = builtin_scenarios[scenario] + elif Path(scenario).exists() and Path(scenario).is_file(): + scenario_path = Path(scenario) + else: + raise ValueError(f"Scenario '{scenario}' not found.") + + with scenario_path.open() as file: + if scenario_path.suffix == ".json": + scenario_data = json.load(file) + elif scenario_path.suffix in {".yaml", ".yml"}: + scenario_data = yaml.safe_load(file) + else: + raise ValueError( + f"Unsupported scenario file format: {scenario_path.suffix}" + ) + if "args" in scenario_data: + # loading from a report file + scenario_data = scenario_data["args"] + constructor_kwargs.update(scenario_data) + + # Apply overrides from kwargs + constructor_kwargs.update(kwargs) + + return cls.model_validate(constructor_kwargs) + + @classmethod + def get_default(cls: type[BenchmarkGenerativeTextArgs], field: str) -> Any: + """ + Retrieve default value for a model field. + + Extracts the default value from field metadata, handling both static defaults + and factory functions. + + :param field: Field name to retrieve default value for + :return: Default value for the field + :raises ValueError: If field does not exist + """ + if field not in cls.model_fields: + raise ValueError(f"Field '{field}' not found in {cls.__name__}") + + field_info = cls.model_fields[field] + factory = field_info.default_factory + + if factory is None: + return field_info.default + + if len(inspect.signature(factory).parameters) == 0: + return factory() # type: ignore[call-arg] + else: + return factory({}) # type: ignore[call-arg] + + model_config = ConfigDict( + extra="ignore", + use_enum_values=True, + from_attributes=True, + arbitrary_types_allowed=True, + validate_by_alias=True, + validate_by_name=True, + alias_generator=AliasGenerator( + # Support field names with hyphens + validation_alias=lambda field_name: AliasChoices( + field_name, field_name.replace("_", "-") + ), + ), + ) + + # Required + target: str = Field(description="Target endpoint URL for benchmark execution") + data: list[Any] = Field( + description="List of dataset sources or data files", + default_factory=list, + min_length=1, + ) + # Benchmark configuration + profile: StrategyType | ProfileType | Profile = Field( + default="sweep", description="Benchmark profile or scheduling strategy type" + ) + rate: list[float] | None = Field( + default=None, description="Request rate(s) for rate-based scheduling" + ) + # Backend configuration + backend: BackendType | Backend = Field( + default="openai_http", description="Backend type or instance for execution" + ) + backend_kwargs: dict[str, Any] | None = Field( + default=None, description="Additional backend configuration arguments" + ) + model: str | None = Field(default=None, description="Model identifier for backend") + # Data configuration + processor: str | Path | PreTrainedTokenizerBase | None = Field( + default=None, description="Tokenizer path, name, or instance for processing" + ) + processor_args: dict[str, Any] | None = Field( + default=None, description="Additional tokenizer configuration arguments" + ) + data_args: list[dict[str, Any]] | None = Field( + 
default_factory=list, # type: ignore[arg-type] + description="Per-dataset configuration arguments", + ) + data_samples: int = Field( + default=-1, description="Number of samples to use from datasets (-1 for all)" + ) + data_column_mapper: ( + DatasetPreprocessor + | dict[str, str | list[str]] + | Literal["generative_column_mapper"] + ) = Field( + default="generative_column_mapper", + description="Column mapping preprocessor for dataset fields", + ) + data_request_formatter: RequestFormatter | dict[str, str] | str = Field( + default="chat_completions", + description="Request formatting preprocessor or template name", + validation_alias=AliasChoices( + "data_request_formatter", + "data-request-formatter", + "request_type", + "request-type", + ), + ) + data_collator: Callable | Literal["generative"] | None = Field( + default="generative", description="Data collator for batch processing" + ) + data_sampler: Sampler[int] | Literal["shuffle"] | None = Field( + default=None, description="Data sampler for request ordering" + ) + data_num_workers: int | None = Field( + default=None, description="Number of workers for data loading" + ) + dataloader_kwargs: dict[str, Any] | None = Field( + default=None, description="Additional dataloader configuration arguments" + ) + random_seed: int = Field(default=42, description="Random seed for reproducibility") + # Output configuration + output_path: str | Path | None = Field( + default_factory=Path.cwd, description="Directory path for output files" + ) + output_formats: list[str] | dict[str, str | dict[str, Any]] | None = Field( + default_factory=lambda: ["console", "json", "csv"], + description="Output format names or configuration mappings", + ) + # Benchmarker configuration + sample_requests: int | None = Field( + default=10, + description="Number of requests to sample for detailed metrics (None for all)", + ) + warmup: float | None = Field( + default=None, + description="Warmup period in seconds, requests, or fraction (0-1)", + ) + cooldown: float | None = Field( + default=None, + description="Cooldown period in seconds, requests, or fraction (0-1)", + ) + prefer_response_metrics: bool = Field( + default=True, + description="Whether to prefer backend response metrics over request metrics", + ) + # Constraints configuration + max_seconds: int | float | None = Field( + default=None, description="Maximum benchmark execution time in seconds" + ) + max_requests: int | None = Field( + default=None, description="Maximum number of requests to execute" + ) + max_errors: int | None = Field( + default=None, description="Maximum number of errors before stopping" + ) + max_error_rate: float | None = Field( + default=None, description="Maximum error rate (0-1) before stopping" + ) + max_global_error_rate: float | None = Field( + default=None, description="Maximum global error rate (0-1) before stopping" + ) + + @field_validator("data", "data_args", "rate", mode="wrap") + @classmethod + def single_to_list( + cls, value: Any, handler: ValidatorFunctionWrapHandler + ) -> list[Any]: + """ + Ensures field is always a list. + + :param value: Input value for the 'data' field + :return: List of data sources + """ + try: + return handler(value) + except ValidationError as err: + # If validation fails, try wrapping the value in a list + if err.errors()[0]["type"] == "list_type": + return handler([value]) + else: + raise + + @model_serializer + def serialize_model(self) -> dict[str, Any]: + """ + Convert model to serializable dictionary format. 
+ + Transforms complex types (Backend, Profile, Path, etc.) to JSON-compatible + primitives while preserving configuration semantics for storage and + reproduction. + + :return: Dictionary representation for JSON/YAML serialization + """ + return { + # target - serialize as is + "target": self.target, + "data": [ + item if isinstance(item, str | type(None)) else str(item) + for item in self.data + ], # data - for each item in the list, if not a str or None, save str(item) + "profile": ( + self.profile.type_ + if isinstance(self.profile, Profile) + else self.profile + ), # profile - if instance of Profile, then save as profile.type_ + "rate": self.rate, + "backend": ( + self.backend.type_ + if isinstance(self.backend, Backend) + else self.backend + ), # backend - if instance of Backend, then save as backend.type_ + "backend_kwargs": self.backend_kwargs, + "model": self.model, + "processor": ( + self.processor + if isinstance(self.processor, str) + else str(self.processor) + if self.processor is not None + else None + ), # processor - if not str, then save as str(processor) + "processor_args": self.processor_args, + "data_args": self.data_args, + "data_samples": self.data_samples, + "data_column_mapper": ( + self.data_column_mapper + if isinstance(self.data_column_mapper, dict | str) + else {} + ), # data_column_mapper - if not dict or str, then save as an empty dict + "data_request_formatter": ( + self.data_request_formatter + if isinstance(self.data_request_formatter, dict | str) + else {} + ), # data_request_formatter - if not dict or str, then save as empty dict + "data_collator": ( + self.data_collator if isinstance(self.data_collator, str) else None + ), # data_collator - if not str, then save as None + "data_sampler": ( + self.data_sampler if isinstance(self.data_sampler, str) else None + ), # data_sampler - if not str, then save as None + "data_num_workers": self.data_num_workers, + "dataloader_kwargs": self.dataloader_kwargs, + "random_seed": self.random_seed, + "output_path": ( + str(self.output_path) if self.output_path is not None else None + ), # output_path - if not None, then ensure it's a str + "output_formats": self.output_formats, + "sample_requests": self.sample_requests, + "warmup": self.warmup, + "cooldown": self.cooldown, + "prefer_response_metrics": self.prefer_response_metrics, + "max_seconds": self.max_seconds, + "max_requests": self.max_requests, + "max_errors": self.max_errors, + "max_error_rate": self.max_error_rate, + "max_global_error_rate": self.max_global_error_rate, + } diff --git a/src/guidellm/benchmark/schemas/generative/metrics.py b/src/guidellm/benchmark/schemas/generative/metrics.py new file mode 100644 index 00000000..82f44f37 --- /dev/null +++ b/src/guidellm/benchmark/schemas/generative/metrics.py @@ -0,0 +1,931 @@ +""" +Metrics schemas for generative AI benchmark results and performance analysis. + +This module defines comprehensive metric structures for tracking and analyzing +generative AI benchmark performance across multiple dimensions including request +statistics, token metrics, and domain-specific measurements for text, image, video, +and audio generation. It provides statistical summaries with distribution analysis +across successful, incomplete, and errored requests, along with scheduler-level +performance metrics for request processing and queueing behavior. 
+""" + +from __future__ import annotations + +from typing import Literal + +from pydantic import Field + +from guidellm.benchmark.schemas.generative.accumulator import ( + GenerativeBenchmarkAccumulator, +) +from guidellm.scheduler import SchedulerState +from guidellm.schemas import ( + GenerativeRequestStats, + StandardBaseDict, + StatusBreakdown, + StatusDistributionSummary, +) + +__all__ = [ + "GenerativeAudioMetricsSummary", + "GenerativeImageMetricsSummary", + "GenerativeMetrics", + "GenerativeMetricsSummary", + "GenerativeTextMetricsSummary", + "GenerativeVideoMetricsSummary", + "SchedulerMetrics", + "StatusTypes", + "TimedMetricTypeAlias", +] + + +TimedMetricTypeAlias = ( + tuple[float, float, int | float | None, int | float | None] | None +) +"""Timed metric tuple containing start_time, end_time, input_value, and output_value.""" + +StatusTypes = Literal["successful", "incomplete", "errored"] +"""Request status category for metric compilation.""" + +# Constants for tuple indexing +_TIMED_METRIC_START_TIME_INDEX = 0 +_TIMED_METRIC_END_TIME_INDEX = 1 +_TIMED_METRIC_INPUT_VALUE_INDEX = 2 +_TIMED_METRIC_OUTPUT_VALUE_INDEX = 3 + + +class SchedulerMetrics(StandardBaseDict): + """ + Scheduler timing and performance statistics. + + Tracks overall benchmark timing, request counts by status, and detailed internal + scheduler performance metrics including queue times, processing delays, and + request execution statistics. Used to analyze scheduler efficiency and identify + bottlenecks in request processing pipelines. + """ + + # Overall timings for the scheduler + start_time: float = Field( + description="Unix timestamp when the benchmark run started" + ) + request_start_time: float = Field( + description="Unix timestamp when first request was made" + ) + measure_start_time: float = Field( + description="Unix timestamp when measurement period started" + ) + measure_end_time: float = Field( + description="Unix timestamp when measurement period ended" + ) + request_end_time: float = Field( + description="Unix timestamp when last request completed" + ) + end_time: float = Field(description="Unix timestamp when the benchmark run ended") + + # Request details tracked by the scheduler + requests_made: StatusBreakdown[int, int, int, int] = Field( + description="Request counts by status: successful, incomplete, errored, total" + ) + + # Scheduler internal performance timings + queued_time_avg: float = Field( + description="Avg time requests spent in the queue (seconds)" + ) + resolve_start_delay_avg: float = Field( + description="Avg delay before worker begins resolving req after dequeue (sec)" + ) + resolve_targeted_start_delay_avg: float = Field( + description="Avg delay to targeted resolve start time (seconds)" + ) + request_start_delay_avg: float = Field( + description="Avg delay before request starts after resolve (seconds)" + ) + request_targeted_start_delay_avg: float = Field( + description="Avg delay to targeted request start time (seconds)" + ) + request_time_avg: float = Field(description="Avg request execution time (seconds)") + resolve_end_delay_avg: float = Field( + description="Avg delay after request completes before resolve ends (seconds)" + ) + resolve_time_avg: float = Field( + description="Avg total resolve time including request (seconds)" + ) + finalized_delay_avg: float = Field( + description="Avg delay from resolve end to request finalization (seconds)" + ) + processed_delay_avg: float = Field( + description="Avg delay from finalization to processing completion (seconds)" + ) + 
+ @classmethod + def compile( + cls, + accumulator: GenerativeBenchmarkAccumulator, + scheduler_state: SchedulerState, + ) -> SchedulerMetrics: + """ + Compile scheduler metrics from accumulator and scheduler state. + + :param accumulator: Benchmark accumulator containing timing and metric data + :param scheduler_state: Scheduler state with execution timing information + :return: Compiled scheduler metrics with performance statistics + """ + return SchedulerMetrics( + # Overall timings for the scheduler + start_time=scheduler_state.start_time, + request_start_time=accumulator.timings.request_start or -1.0, + measure_start_time=accumulator.timings.measure_start or -1.0, + measure_end_time=( + accumulator.timings.measure_end + or accumulator.timings.request_end + or -1.0 + ), # if no cooldown, measure_end isn't set, use request_end + request_end_time=accumulator.timings.request_end or -1.0, + end_time=scheduler_state.end_time or -1.0, + # Request details tracked by the scheduler + requests_made=accumulator.scheduler_metrics.requests_made, + # Scheduler internal performance timings + queued_time_avg=accumulator.scheduler_metrics.queued_time.mean or -1.0, + resolve_start_delay_avg=( + accumulator.scheduler_metrics.resolve_start_delay.mean or -1.0 + ), + resolve_targeted_start_delay_avg=( + accumulator.scheduler_metrics.resolve_targeted_start_delay.mean or -1.0 + ), + request_start_delay_avg=( + accumulator.scheduler_metrics.request_start_delay.mean or -1.0 + ), + request_targeted_start_delay_avg=( + accumulator.scheduler_metrics.request_targeted_start_delay.mean or -1.0 + ), + request_time_avg=accumulator.scheduler_metrics.request_time.mean or -1.0, + resolve_end_delay_avg=( + accumulator.scheduler_metrics.resolve_end_delay.mean or -1.0 + ), + resolve_time_avg=accumulator.scheduler_metrics.resolve_time.mean or -1.0, + finalized_delay_avg=( + accumulator.scheduler_metrics.finalized_delay.mean or -1.0 + ), + processed_delay_avg=( + accumulator.scheduler_metrics.processed_delay.mean or -1.0 + ), + ) + + +class GenerativeMetricsSummary(StandardBaseDict): + """ + Statistical summaries for input, output, and total metrics. + + Provides distribution summaries across successful, incomplete, and errored + requests for absolute values, per-second rates, and concurrency levels. 
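+
+    Example (a minimal sketch; assumes ``successful``, ``incomplete``, and
+    ``errored`` are lists of GenerativeRequestStats from a finished benchmark):
+    ::
+        summary = GenerativeMetricsSummary.compile(
+            property_name="text_tokens",
+            successful=successful,
+            incomplete=incomplete,
+            errored=errored,
+        )
+        if summary is not None and summary.output_per_second is not None:
+            print(summary.output_per_second.successful.mean)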
+ """ + + input: StatusDistributionSummary | None = Field( + description="Distribution of input metric values" + ) + input_per_second: StatusDistributionSummary | None = Field( + description="Distribution of input metric rates per second" + ) + input_concurrency: StatusDistributionSummary | None = Field( + description="Distribution of concurrent input metric values" + ) + + output: StatusDistributionSummary | None = Field( + description="Distribution of output metric values" + ) + output_per_second: StatusDistributionSummary | None = Field( + description="Distribution of output metric rates per second" + ) + output_concurrency: StatusDistributionSummary | None = Field( + description="Distribution of concurrent output metric values" + ) + + total: StatusDistributionSummary | None = Field( + description="Distribution of total metric values (input + output)" + ) + total_per_second: StatusDistributionSummary | None = Field( + description="Distribution of total metric rates per second" + ) + total_concurrency: StatusDistributionSummary | None = Field( + description="Distribution of concurrent total metric values" + ) + + @classmethod + def compile( + cls, + property_name: str, + successful: list[GenerativeRequestStats], + incomplete: list[GenerativeRequestStats], + errored: list[GenerativeRequestStats], + ) -> GenerativeMetricsSummary | None: + """ + Compile metrics summary from request statistics for a specific property. + + :param property_name: Name of the property to extract from request metrics + :param successful: Successfully completed request statistics + :param incomplete: Incomplete or cancelled request statistics + :param errored: Failed request statistics + :return: Compiled metrics summary or None if no data available + """ + successful_metrics = cls.extract_property_metrics_for_summary( + successful, property_name + ) + incomplete_metrics = cls.extract_property_metrics_for_summary( + incomplete, property_name + ) + errored_metrics = cls.extract_property_metrics_for_summary( + errored, property_name + ) + + return cls.compile_timed_metrics( + successful=successful_metrics, + incomplete=incomplete_metrics, + errored=errored_metrics, + ) + + @classmethod + def compile_timed_metrics( + cls, + successful: list[TimedMetricTypeAlias], + incomplete: list[TimedMetricTypeAlias], + errored: list[TimedMetricTypeAlias], + ) -> GenerativeMetricsSummary | None: + """ + Compile metrics summary from timed metric tuples. 
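+
+        Each entry is expected to follow the TimedMetricTypeAlias layout of
+        (start_time, end_time, input_value, output_value); for example,
+        ``(1700000000.0, 1700000002.5, 128, 256)`` would describe a request that
+        consumed 128 input units and produced 256 output units (illustrative
+        values only).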
+ + :param successful: Timed metrics from successful requests + :param incomplete: Timed metrics from incomplete requests + :param errored: Timed metrics from errored requests + :return: Compiled metrics summary or None if no data available + """ + + def _compile_metric_distributions( + metrics_by_status: dict[StatusTypes, list[TimedMetricTypeAlias]], + value_index: int, + ) -> tuple[ + StatusDistributionSummary | None, + StatusDistributionSummary | None, + StatusDistributionSummary | None, + dict[StatusTypes, list[float]], + dict[StatusTypes, list[tuple[float, float]]], + dict[StatusTypes, list[tuple[float, float, float]]], + ]: + """Helper to compile value, rate, and concurrency distributions.""" + value_lists: dict[StatusTypes, list[float]] = { + status: [ + float(metric[value_index] or 0.0) + for metric in metrics + if metric is not None + ] + for status, metrics in metrics_by_status.items() + } + value_dist = StatusDistributionSummary.from_values( + successful=value_lists["successful"], + incomplete=value_lists["incomplete"], + errored=value_lists["errored"], + ) + + if value_dist.total_sum == 0.0: + return None, None, None, value_lists, {}, {} + + rate_lists: dict[StatusTypes, list[tuple[float, float]]] = { + status: [ + ( # type: ignore[misc] + metric[_TIMED_METRIC_END_TIME_INDEX], + float(metric[value_index] or 0.0), + ) + for metric in metrics + if metric is not None + ] + for status, metrics in metrics_by_status.items() + } + rate_dist = StatusDistributionSummary.rate_distribution_from_timings( + successful=rate_lists["successful"], + incomplete=rate_lists["incomplete"], + errored=rate_lists["errored"], + ) + + concurrency_lists: dict[StatusTypes, list[tuple[float, float, float]]] = { + status: [ + ( # type: ignore[misc] + metric[_TIMED_METRIC_START_TIME_INDEX], + metric[_TIMED_METRIC_END_TIME_INDEX], + float(metric[value_index] or 0.0), + ) + for metric in metrics + if metric is not None + ] + for status, metrics in metrics_by_status.items() + } + concurrency_dist = ( + StatusDistributionSummary.concurrency_distribution_from_timings( + successful=concurrency_lists["successful"], + incomplete=concurrency_lists["incomplete"], + errored=concurrency_lists["errored"], + ) + ) + + return ( + value_dist, + rate_dist, + concurrency_dist, + value_lists, + rate_lists, + concurrency_lists, + ) + + metrics_by_status: dict[StatusTypes, list[TimedMetricTypeAlias]] = { + "successful": successful, + "incomplete": incomplete, + "errored": errored, + } + + # Calculate input distributions + ( + input_value_dist, + input_rate_dist, + input_concurrency_dist, + input_value_lists, + input_rate_lists, + input_concurrency_lists, + ) = _compile_metric_distributions( + metrics_by_status, _TIMED_METRIC_INPUT_VALUE_INDEX + ) + + # Calculate output distributions + ( + output_value_dist, + output_rate_dist, + output_concurrency_dist, + output_value_lists, + output_rate_lists, + output_concurrency_lists, + ) = _compile_metric_distributions( + metrics_by_status, _TIMED_METRIC_OUTPUT_VALUE_INDEX + ) + + # Calculate total distributions if both input and output have data + if input_value_dist is not None and output_value_dist is not None: + total_value_dist = StatusDistributionSummary.from_values( + successful=( + input_value_lists["successful"] + output_value_lists["successful"] + ), + incomplete=( + input_value_lists["incomplete"] + output_value_lists["incomplete"] + ), + errored=input_value_lists["errored"] + output_value_lists["errored"], + ) + total_rate_dist = 
StatusDistributionSummary.rate_distribution_from_timings( + successful=( + input_rate_lists["successful"] + output_rate_lists["successful"] + ), + incomplete=( + input_rate_lists["incomplete"] + output_rate_lists["incomplete"] + ), + errored=input_rate_lists["errored"] + output_rate_lists["errored"], + ) + total_concurrency_dist = ( + StatusDistributionSummary.concurrency_distribution_from_timings( + successful=( + input_concurrency_lists["successful"] + + output_concurrency_lists["successful"] + ), + incomplete=( + input_concurrency_lists["incomplete"] + + output_concurrency_lists["incomplete"] + ), + errored=( + input_concurrency_lists["errored"] + + output_concurrency_lists["errored"] + ), + ) + ) + else: + total_value_dist = None + total_rate_dist = None + total_concurrency_dist = None + + return GenerativeMetricsSummary( + input=input_value_dist, + input_per_second=input_rate_dist, + input_concurrency=input_concurrency_dist, + output=output_value_dist, + output_per_second=output_rate_dist, + output_concurrency=output_concurrency_dist, + total=total_value_dist, + total_per_second=total_rate_dist, + total_concurrency=total_concurrency_dist, + ) + + @classmethod + def extract_property_metrics_for_summary( + cls, stats_list: list[GenerativeRequestStats], property_name: str + ) -> list[TimedMetricTypeAlias]: + """ + Extract timed metrics for a specific property from request statistics. + + :param stats_list: List of request statistics to extract from + :param property_name: Name of the property to extract from metrics + :return: List of tuples containing + (start_time, end_time, input_value, output_value) + """ + return [ + ( + stats.request_start_time, + stats.request_end_time, + getattr(stats.input_metrics, property_name), + getattr(stats.output_metrics, property_name), + ) + for stats in stats_list + if ( + stats.request_start_time + and stats.request_end_time + and ( + getattr(stats.input_metrics, property_name) is not None + or getattr(stats.output_metrics, property_name) is not None + ) + ) + ] + + +class GenerativeTextMetricsSummary(StandardBaseDict): + """ + Text-specific metric summaries for generative benchmarks. + + Tracks token, word, and character-level metrics across input, output, and + total usage for text generation workloads. + """ + + tokens: GenerativeMetricsSummary | None = Field( + description="Token count metrics and distributions" + ) + words: GenerativeMetricsSummary | None = Field( + description="Word count metrics and distributions" + ) + characters: GenerativeMetricsSummary | None = Field( + description="Character count metrics and distributions" + ) + + @classmethod + def compile( + cls, + successful: list[GenerativeRequestStats], + incomplete: list[GenerativeRequestStats], + errored: list[GenerativeRequestStats], + ) -> GenerativeTextMetricsSummary: + """ + Compile text metrics summary from request statistics. 
+ + :param successful: Successfully completed request statistics + :param incomplete: Incomplete/cancelled request statistics + :param errored: Failed request statistics + :return: Compiled text metrics summary + """ + return GenerativeTextMetricsSummary( + tokens=GenerativeMetricsSummary.compile( + property_name="text_tokens", + successful=successful, + incomplete=incomplete, + errored=errored, + ), + words=GenerativeMetricsSummary.compile( + property_name="text_words", + successful=successful, + incomplete=incomplete, + errored=errored, + ), + characters=GenerativeMetricsSummary.compile( + property_name="text_characters", + successful=successful, + incomplete=incomplete, + errored=errored, + ), + ) + + +class GenerativeImageMetricsSummary(StandardBaseDict): + """ + Image-specific metric summaries for generative benchmarks. + + Tracks token, image count, pixel, and byte-level metrics across input, output, + and total usage for image generation workloads. + """ + + tokens: GenerativeMetricsSummary | None = Field( + description="Image token count metrics and distributions" + ) + images: GenerativeMetricsSummary | None = Field( + description="Image count metrics and distributions" + ) + pixels: GenerativeMetricsSummary | None = Field( + description="Pixel count metrics and distributions" + ) + bytes: GenerativeMetricsSummary | None = Field( + description="Byte size metrics and distributions" + ) + + @classmethod + def compile( + cls, + successful: list[GenerativeRequestStats], + incomplete: list[GenerativeRequestStats], + errored: list[GenerativeRequestStats], + ) -> GenerativeImageMetricsSummary: + """ + Compile image metrics summary from request statistics. + + :param successful: Successfully completed request statistics + :param incomplete: Incomplete/cancelled request statistics + :param errored: Failed request statistics + :return: Compiled image metrics summary + """ + return GenerativeImageMetricsSummary( + tokens=GenerativeMetricsSummary.compile( + property_name="image_tokens", + successful=successful, + incomplete=incomplete, + errored=errored, + ), + images=GenerativeMetricsSummary.compile( + property_name="image_count", + successful=successful, + incomplete=incomplete, + errored=errored, + ), + pixels=GenerativeMetricsSummary.compile( + property_name="image_pixels", + successful=successful, + incomplete=incomplete, + errored=errored, + ), + bytes=GenerativeMetricsSummary.compile( + property_name="image_bytes", + successful=successful, + incomplete=incomplete, + errored=errored, + ), + ) + + +class GenerativeVideoMetricsSummary(StandardBaseDict): + """ + Video-specific metric summaries for generative benchmarks. + + Tracks token, frame count, duration, and byte-level metrics across input, + output, and total usage for video generation workloads. + """ + + tokens: GenerativeMetricsSummary | None = Field( + description="Video token count metrics and distributions" + ) + frames: GenerativeMetricsSummary | None = Field( + description="Frame count metrics and distributions" + ) + seconds: GenerativeMetricsSummary | None = Field( + description="Duration metrics in seconds and distributions" + ) + bytes: GenerativeMetricsSummary | None = Field( + description="Byte size metrics and distributions" + ) + + @classmethod + def compile( + cls, + successful: list[GenerativeRequestStats], + incomplete: list[GenerativeRequestStats], + errored: list[GenerativeRequestStats], + ) -> GenerativeVideoMetricsSummary: + """ + Compile video metrics summary from request statistics. 
+ + :param successful: Successfully completed request statistics + :param incomplete: Incomplete/cancelled request statistics + :param errored: Failed request statistics + :return: Compiled video metrics summary + """ + return GenerativeVideoMetricsSummary( + tokens=GenerativeMetricsSummary.compile( + property_name="video_tokens", + successful=successful, + incomplete=incomplete, + errored=errored, + ), + frames=GenerativeMetricsSummary.compile( + property_name="video_frames", + successful=successful, + incomplete=incomplete, + errored=errored, + ), + seconds=GenerativeMetricsSummary.compile( + property_name="video_seconds", + successful=successful, + incomplete=incomplete, + errored=errored, + ), + bytes=GenerativeMetricsSummary.compile( + property_name="video_bytes", + successful=successful, + incomplete=incomplete, + errored=errored, + ), + ) + + +class GenerativeAudioMetricsSummary(StandardBaseDict): + """ + Audio-specific metric summaries for generative benchmarks. + + Tracks token, sample count, duration, and byte-level metrics across input, + output, and total usage for audio generation workloads. + """ + + tokens: GenerativeMetricsSummary | None = Field( + description="Audio token count metrics and distributions" + ) + samples: GenerativeMetricsSummary | None = Field( + description="Sample count metrics and distributions" + ) + seconds: GenerativeMetricsSummary | None = Field( + description="Duration metrics in seconds and distributions" + ) + bytes: GenerativeMetricsSummary | None = Field( + description="Byte size metrics and distributions" + ) + + @classmethod + def compile( + cls, + successful: list[GenerativeRequestStats], + incomplete: list[GenerativeRequestStats], + errored: list[GenerativeRequestStats], + ) -> GenerativeAudioMetricsSummary: + """ + Compile audio metrics summary from request statistics. + + :param successful: Successfully completed request statistics + :param incomplete: Incomplete/cancelled request statistics + :param errored: Failed request statistics + :return: Compiled audio metrics summary + """ + return GenerativeAudioMetricsSummary( + tokens=GenerativeMetricsSummary.compile( + property_name="audio_tokens", + successful=successful, + incomplete=incomplete, + errored=errored, + ), + samples=GenerativeMetricsSummary.compile( + property_name="audio_samples", + successful=successful, + incomplete=incomplete, + errored=errored, + ), + seconds=GenerativeMetricsSummary.compile( + property_name="audio_seconds", + successful=successful, + incomplete=incomplete, + errored=errored, + ), + bytes=GenerativeMetricsSummary.compile( + property_name="audio_bytes", + successful=successful, + incomplete=incomplete, + errored=errored, + ), + ) + + +class GenerativeMetrics(StandardBaseDict): + """ + Comprehensive metrics for generative AI benchmarks. + + Aggregates request statistics, token metrics, timing distributions, and + domain-specific measurements across text, image, video, and audio modalities. + Provides detailed statistical summaries including distribution analysis for + throughput, latency, concurrency, and resource utilization metrics across + successful, incomplete, and errored requests. 
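+
+    Example (a minimal sketch; assumes ``accumulator`` is a populated
+    GenerativeBenchmarkAccumulator whose measurement window has completed):
+    ::
+        metrics = GenerativeMetrics.compile(accumulator)
+        print(metrics.request_totals.successful)
+        print(metrics.time_to_first_token_ms.successful.mean)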
+ """ + + # Request stats + request_totals: StatusBreakdown[int, int, int, int] = Field( + description="Request counts by status: successful, incomplete, errored, total" + ) + requests_per_second: StatusDistributionSummary = Field( + description="Distribution of requests per second across benchmark execution" + ) + request_concurrency: StatusDistributionSummary = Field( + description="Distribution of concurrent request counts during execution" + ) + request_latency: StatusDistributionSummary = Field( + description="Distribution of request latencies for completed requests" + ) + request_streaming_iterations_count: StatusDistributionSummary = Field( + description="Distribution of stream iterations for completed requests" + ) + + # General token stats + prompt_token_count: StatusDistributionSummary = Field( + description="Distribution of prompt token counts by request status" + ) + output_token_count: StatusDistributionSummary = Field( + description="Distribution of output token counts by request status" + ) + total_token_count: StatusDistributionSummary = Field( + description="Distribution of total token counts by request status" + ) + time_to_first_token_ms: StatusDistributionSummary = Field( + description="Distribution of first token latencies in milliseconds" + ) + time_per_output_token_ms: StatusDistributionSummary = Field( + description="Distribution of average time per output token in milliseconds" + ) + inter_token_latency_ms: StatusDistributionSummary = Field( + description="Distribution of inter-token latencies in milliseconds" + ) + prompt_tokens_per_second: StatusDistributionSummary = Field( + description="Distribution of prompt token processing rates" + ) + output_tokens_per_second: StatusDistributionSummary = Field( + description="Distribution of output token generation rates" + ) + tokens_per_second: StatusDistributionSummary = Field( + description="Distribution of total token throughput including prompt and output" + ) + output_tokens_per_iteration: StatusDistributionSummary = Field( + description="Distribution of output tokens generated per streaming iteration" + ) + iter_tokens_per_iteration: StatusDistributionSummary = Field( + description=( + "Distribution of output tokens (without first) generated per " + "streaming iteration" + ) + ) + + # Domain specific stats + text: GenerativeTextMetricsSummary = Field( + description="Text-specific metrics for tokens, words, and characters" + ) + image: GenerativeImageMetricsSummary = Field( + description="Image-specific metrics for tokens, images, pixels, and bytes" + ) + video: GenerativeVideoMetricsSummary = Field( + description="Video-specific metrics for tokens, frames, duration, and bytes" + ) + audio: GenerativeAudioMetricsSummary = Field( + description="Audio-specific metrics for tokens, samples, duration, and bytes" + ) + + @classmethod + def compile(cls, accumulator: GenerativeBenchmarkAccumulator) -> GenerativeMetrics: + """ + Compile comprehensive generative metrics from benchmark accumulator. 
+ + :param accumulator: Benchmark accumulator with completed request statistics + :return: Compiled generative metrics with all distributions and summaries + :raises ValueError: If measure_start and measure_end/request_end are not set + """ + if (start_time := accumulator.timings.measure_start) is None or ( + end_time := accumulator.timings.measure_end + or accumulator.timings.request_end + ) is None: + raise ValueError( + "Cannot compile GenerativeMetrics: " + "measure_start and measure_end/request_end must be set" + ) + + successful = accumulator.completed.get_within_range(start_time, end_time) + incomplete = accumulator.incomplete.get_within_range(start_time, end_time) + errored = accumulator.errored.get_within_range(start_time, end_time) + + return GenerativeMetrics( + # Request stats + request_totals=StatusBreakdown( + successful=len(successful), + incomplete=len(incomplete), + errored=len(errored), + total=(len(successful) + len(incomplete) + len(errored)), + ), + requests_per_second=StatusDistributionSummary.rate_distribution_from_timings_function( + function=lambda req: req.request_end_time, + successful=successful, + incomplete=incomplete, + errored=errored, + start_time=start_time, + end_time=end_time, + ), + request_concurrency=StatusDistributionSummary.concurrency_distribution_from_timings_function( + function=( + lambda req: (req.request_start_time, req.request_end_time) + if req.request_start_time is not None + and req.request_end_time is not None + else None + ), + successful=successful, + incomplete=incomplete, + errored=errored, + start_time=start_time, + end_time=end_time, + ), + request_latency=StatusDistributionSummary.from_values_function( + function=lambda req: req.request_latency or 0.0, + successful=successful, + incomplete=incomplete, + errored=errored, + ), + request_streaming_iterations_count=StatusDistributionSummary.from_values_function( + function=lambda req: req.info.timings.request_iterations or 0.0, + successful=successful, + incomplete=incomplete, + errored=errored, + ), + # General token stats + prompt_token_count=StatusDistributionSummary.from_values_function( + function=lambda req: req.prompt_tokens or 0.0, + successful=successful, + incomplete=incomplete, + errored=errored, + ), + output_token_count=StatusDistributionSummary.from_values_function( + function=lambda req: req.output_tokens or 0.0, + successful=successful, + incomplete=incomplete, + errored=errored, + ), + total_token_count=StatusDistributionSummary.from_values_function( + function=lambda req: req.total_tokens or 0.0, + successful=successful, + incomplete=incomplete, + errored=errored, + ), + time_to_first_token_ms=StatusDistributionSummary.from_values_function( + function=lambda req: req.time_to_first_token_ms or 0.0, + successful=successful, + incomplete=incomplete, + errored=errored, + ), + time_per_output_token_ms=StatusDistributionSummary.from_values_function( + function=lambda req: ( + req.time_per_output_token_ms or 0.0, + req.output_tokens or 0.0, + ), + successful=successful, + incomplete=incomplete, + errored=errored, + ), + inter_token_latency_ms=StatusDistributionSummary.from_values_function( + function=lambda req: ( + req.inter_token_latency_ms or 0.0, + (req.output_tokens or 1.0) - 1.0, + ), + successful=successful, + incomplete=incomplete, + errored=errored, + ), + prompt_tokens_per_second=StatusDistributionSummary.rate_distribution_from_timings_function( + function=lambda req: req.prompt_tokens_timing, + successful=successful, + incomplete=incomplete, + errored=errored, + 
), + output_tokens_per_second=StatusDistributionSummary.rate_distribution_from_timings_function( + function=lambda req: req.output_tokens_timings, + successful=successful, + incomplete=incomplete, + errored=errored, + ), + tokens_per_second=StatusDistributionSummary.rate_distribution_from_timings_function( + function=lambda req: req.total_tokens_timings, + successful=successful, + incomplete=incomplete, + errored=errored, + ), + output_tokens_per_iteration=StatusDistributionSummary.from_values_function( + function=lambda req: [ + tokens for (_timing, tokens) in req.output_tokens_timings + ], + successful=successful, + incomplete=incomplete, + errored=errored, + ), + iter_tokens_per_iteration=StatusDistributionSummary.from_values_function( + function=lambda req: [ + tokens for (_timing, tokens) in req.iter_tokens_timings + ], + successful=successful, + incomplete=incomplete, + errored=errored, + ), + # Domain-specific stats + text=GenerativeTextMetricsSummary.compile( + successful=successful, incomplete=incomplete, errored=errored + ), + image=GenerativeImageMetricsSummary.compile( + successful=successful, incomplete=incomplete, errored=errored + ), + video=GenerativeVideoMetricsSummary.compile( + successful=successful, incomplete=incomplete, errored=errored + ), + audio=GenerativeAudioMetricsSummary.compile( + successful=successful, incomplete=incomplete, errored=errored + ), + ) diff --git a/src/guidellm/benchmark/schemas/generative/report.py b/src/guidellm/benchmark/schemas/generative/report.py new file mode 100644 index 00000000..16cc654b --- /dev/null +++ b/src/guidellm/benchmark/schemas/generative/report.py @@ -0,0 +1,125 @@ +""" +Report container for multiple generative benchmark results with persistence. + +Provides data structures for aggregating multiple benchmark executions into a single +report with file I/O capabilities. Supports loading and saving benchmark collections +in JSON and YAML formats, enabling result persistence, sharing, and analysis across +different execution sessions. Core functionality includes benchmark grouping with +shared configuration parameters and flexible file path resolution. +""" + +from __future__ import annotations + +import json +from pathlib import Path +from typing import ClassVar, Literal + +import yaml +from pydantic import Field + +from guidellm.benchmark.schemas.generative.benchmark import GenerativeBenchmark +from guidellm.benchmark.schemas.generative.entrypoints import ( + BenchmarkGenerativeTextArgs, +) +from guidellm.schemas import StandardBaseModel + +__all__ = ["GenerativeBenchmarksReport"] + + +class GenerativeBenchmarksReport(StandardBaseModel): + """ + Container for multiple benchmark results with load/save functionality. + + Aggregates multiple generative benchmark executions into a single report, + providing persistence through JSON and YAML file formats. Enables result + collection, storage, and retrieval across different execution sessions with + automatic file type detection and path resolution. 
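+
+    Example (a minimal sketch; ``args`` stands in for a BenchmarkGenerativeTextArgs
+    instance, ``benchmark`` for a completed GenerativeBenchmark, and the file path
+    is a placeholder):
+    ::
+        report = GenerativeBenchmarksReport(args=args, benchmarks=[benchmark])
+        saved_path = report.save_file("results/benchmarks.json")
+        loaded = GenerativeBenchmarksReport.load_file(saved_path)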
+ + :cvar DEFAULT_FILE: Default filename used when saving to or loading from a directory + """ + + DEFAULT_FILE: ClassVar[str] = "benchmarks.json" + + args: BenchmarkGenerativeTextArgs = Field( + description="Benchmark arguments used for all benchmarks in the report" + ) + benchmarks: list[GenerativeBenchmark] = Field( + description="List of completed benchmarks in the report", + default_factory=list, + ) + + def save_file( + self, + path: str | Path | None = None, + type_: Literal["json", "yaml"] | None = None, + ) -> Path: + """ + Save report to file in JSON or YAML format. + + :param path: File path or directory for saving, defaults to current directory + with DEFAULT_FILE name + :param type_: File format override ('json' or 'yaml'), auto-detected from + extension if None + :return: Resolved path to the saved file + :raises ValueError: If file type is unsupported or cannot be determined + """ + file_path = GenerativeBenchmarksReport._resolve_path( + path if path is not None else Path.cwd() + ) + file_path.parent.mkdir(parents=True, exist_ok=True) + file_type = type_ or file_path.suffix.lower()[1:] + model_dict = self.model_dump() + + if file_type == "json": + save_str = json.dumps(model_dict) + elif file_type in ["yaml", "yml"]: + save_str = yaml.dump(model_dict) + else: + raise ValueError(f"Unsupported file type: {file_type} for {file_path}.") + + with file_path.open("w") as file: + file.write(save_str) + + return file_path + + @classmethod + def load_file( + cls, path: str | Path, type_: Literal["json", "yaml"] | None = None + ) -> GenerativeBenchmarksReport: + """ + Load report from JSON or YAML file. + + :param path: File path or directory containing DEFAULT_FILE to load from + :param type_: File format override ('json' or 'yaml'), auto-detected from + extension if None + :return: Loaded report instance with benchmarks and configuration + :raises ValueError: If file type is unsupported or cannot be determined + :raises FileNotFoundError: If specified file does not exist + """ + file_path = GenerativeBenchmarksReport._resolve_path(path) + file_type = type_ or file_path.suffix.lower()[1:] + + with file_path.open("r") as file: + if file_type == "json": + model_dict = json.loads(file.read()) + elif file_type in ["yaml", "yml"]: + model_dict = yaml.safe_load(file) + else: + raise ValueError(f"Unsupported file type: {file_type} for {file_path}.") + + return GenerativeBenchmarksReport.model_validate(model_dict) + + @classmethod + def _resolve_path(cls, path: str | Path) -> Path: + """ + Resolve input to file path, converting directories to DEFAULT_FILE location. 
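+
+        For example, passing a directory such as ``Path("results")`` (assumed to
+        already exist) resolves to ``results/benchmarks.json`` via DEFAULT_FILE,
+        while an explicit file path is returned unchanged.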
+ + :param path: String or Path to resolve, directories append DEFAULT_FILE + :return: Resolved file path + """ + resolved = Path(path) if not isinstance(path, Path) else path + + if resolved.is_dir(): + resolved = resolved / GenerativeBenchmarksReport.DEFAULT_FILE + + return resolved diff --git a/src/guidellm/data/__init__.py b/src/guidellm/data/__init__.py index 0bff1b64..9adbd3c8 100644 --- a/src/guidellm/data/__init__.py +++ b/src/guidellm/data/__init__.py @@ -9,6 +9,7 @@ DataDependentPreprocessor, DatasetPreprocessor, PreprocessorRegistry, + RequestFormatter, ) from .processor import ProcessorFactory from .schemas import GenerativeDatasetColumnType @@ -25,4 +26,5 @@ "GenerativeRequestCollator", "PreprocessorRegistry", "ProcessorFactory", + "RequestFormatter", ] diff --git a/src/guidellm/data/deserializers/synthetic.py b/src/guidellm/data/deserializers/synthetic.py index e1df911a..6e098462 100644 --- a/src/guidellm/data/deserializers/synthetic.py +++ b/src/guidellm/data/deserializers/synthetic.py @@ -17,7 +17,8 @@ DatasetDeserializer, DatasetDeserializerFactory, ) -from guidellm.utils import IntegerRangeSampler, StandardBaseModel +from guidellm.schemas import StandardBaseModel +from guidellm.utils import IntegerRangeSampler __all__ = [ "SyntheticTextDatasetConfig", diff --git a/src/guidellm/data/loaders.py b/src/guidellm/data/loaders.py index b4ee38da..4f96002e 100644 --- a/src/guidellm/data/loaders.py +++ b/src/guidellm/data/loaders.py @@ -2,7 +2,7 @@ import contextlib from collections.abc import Callable, Iterator -from typing import Any, Literal +from typing import Any, Literal, TypeVar import torch from torch.utils.data import Sampler @@ -17,7 +17,10 @@ __all__ = ["DataLoader", "DatasetsIterator"] -class DatasetsIterator(TorchIterableDataset): +DataT = TypeVar("DataT") + + +class DatasetsIterator(TorchIterableDataset[DataT]): def __init__( self, data: list[Any], @@ -60,7 +63,7 @@ def __init__( list(self.generator(data_samples)) if data_samples else None ) - def __iter__(self): + def __iter__(self) -> Iterator[DataT]: worker_info = torch.utils.data.get_worker_info() worker_modulus = worker_info.num_workers if worker_info is not None else 1 worker_index = worker_info.id if worker_info is not None else 0 @@ -77,7 +80,7 @@ def generator( max_items: int | None = None, modulus: int | None = None, offset: int | None = None, - ) -> Iterator[Any]: + ) -> Iterator[DataT]: gen_count = 0 with contextlib.suppress(StopIteration): @@ -102,7 +105,7 @@ def generator( # passed into the preprocessor, which is a type violation. # This should be fixed at some point. 
row = preprocessor(row) # type: ignore[assignment] - yield row + yield row # type: ignore[misc] except Exception as err: # noqa: BLE001 # Exception logged logger.error(f"Skipping data row due to error: {err}") gen_count -= 1 @@ -114,7 +117,7 @@ def generator( ) -class DataLoader(PyTorchDataLoader): +class DataLoader(PyTorchDataLoader[DataT]): def __init__( self, data: list[Any], @@ -128,7 +131,7 @@ def __init__( random_seed: int = 42, **kwargs: Any, ): - iterator = DatasetsIterator( + iterator: DatasetsIterator[DataT] = DatasetsIterator( data=data, data_args=data_args, data_samples=data_samples, diff --git a/src/guidellm/data/preprocessors/__init__.py b/src/guidellm/data/preprocessors/__init__.py index 664e196b..6d6e722d 100644 --- a/src/guidellm/data/preprocessors/__init__.py +++ b/src/guidellm/data/preprocessors/__init__.py @@ -3,6 +3,7 @@ GenerativeAudioTranslationRequestFormatter, GenerativeChatCompletionsRequestFormatter, GenerativeTextCompletionsRequestFormatter, + RequestFormatter, ) from .mappers import GenerativeColumnMapper from .preprocessor import ( @@ -22,4 +23,5 @@ "GenerativeColumnMapper", "GenerativeTextCompletionsRequestFormatter", "PreprocessorRegistry", + "RequestFormatter", ] diff --git a/src/guidellm/data/preprocessors/formatters.py b/src/guidellm/data/preprocessors/formatters.py index 5a869403..608128a6 100644 --- a/src/guidellm/data/preprocessors/formatters.py +++ b/src/guidellm/data/preprocessors/formatters.py @@ -1,6 +1,5 @@ from __future__ import annotations -from abc import ABCMeta from typing import Any from guidellm.data.preprocessors.preprocessor import ( @@ -14,10 +13,14 @@ "GenerativeAudioTranslationRequestFormatter", "GenerativeChatCompletionsRequestFormatter", "GenerativeTextCompletionsRequestFormatter", + "RequestFormatter", ] -class RequestFormatter(DatasetPreprocessor, metaclass=ABCMeta): +class RequestFormatter(DatasetPreprocessor): + def __init__(self, model: str, **_kwargs): + self.model = model + @staticmethod def encode_audio(*args, **kwargs): from guidellm.extras.audio import encode_audio @@ -47,7 +50,7 @@ def __init__( max_tokens: int | None = None, max_completion_tokens: int | None = None, ): - self.model: str | None = model + self.model: str = model self.extras = ( GenerationRequestArguments(**extras) if extras and isinstance(extras, dict) @@ -73,6 +76,7 @@ def __call__(self, columns: dict[str, list[Any]]) -> GenerationRequest: if self.stream: arguments.stream = True arguments.body["stream"] = True + arguments.body["stream_options"] = {"include_usage": True} # Handle output tokens if output_tokens := sum( @@ -158,9 +162,8 @@ def __call__( # noqa: C901, PLR0912, PLR0915 # Configure streaming if self.stream: arguments.stream = True - arguments.body.update( - {"stream": True, "stream_options": {"include_usage": True}} - ) + arguments.body["stream"] = True + arguments.body["stream_options"] = {"include_usage": True} # Handle output tokens if output_tokens := sum( @@ -334,6 +337,7 @@ def __call__( # noqa: C901 if self.stream: arguments.stream = True arguments.body["stream"] = True + arguments.body["stream_options"] = {"include_usage": True} # Handle output tokens if output_tokens := sum( diff --git a/src/guidellm/data/preprocessors/preprocessor.py b/src/guidellm/data/preprocessors/preprocessor.py index e95ad75d..43fe20e9 100644 --- a/src/guidellm/data/preprocessors/preprocessor.py +++ b/src/guidellm/data/preprocessors/preprocessor.py @@ -25,6 +25,6 @@ def setup_data( class PreprocessorRegistry( - RegistryMixin[DataDependentPreprocessor | 
type[DataDependentPreprocessor]] + RegistryMixin[type[DatasetPreprocessor] | type[DataDependentPreprocessor]] ): pass diff --git a/src/guidellm/data/processor.py b/src/guidellm/data/processor.py index 7962bfbf..e55eb123 100644 --- a/src/guidellm/data/processor.py +++ b/src/guidellm/data/processor.py @@ -1,11 +1,9 @@ from __future__ import annotations +from pathlib import Path from typing import Any -from transformers import ( # type: ignore[import] - AutoTokenizer, - PreTrainedTokenizerBase, -) +from transformers import AutoTokenizer, PreTrainedTokenizerBase # type: ignore[import] __all__ = ["ProcessorFactory"] @@ -13,7 +11,7 @@ class ProcessorFactory: def __init__( self, - processor: str | PreTrainedTokenizerBase, + processor: str | Path | PreTrainedTokenizerBase, processor_args: dict[str, Any] | None = None, ) -> None: self.processor = processor diff --git a/src/guidellm/mock_server/server.py b/src/guidellm/mock_server/server.py index ff9d5fcd..e85c6134 100644 --- a/src/guidellm/mock_server/server.py +++ b/src/guidellm/mock_server/server.py @@ -11,12 +11,13 @@ from __future__ import annotations import time +from typing import Any from sanic import Sanic, response from sanic.exceptions import NotFound from sanic.log import logger from sanic.request import Request -from sanic.response import HTTPResponse +from sanic.response import BaseHTTPResponse, HTTPResponse from guidellm.mock_server.config import MockServerConfig from guidellm.mock_server.handlers import ( @@ -65,16 +66,20 @@ def _setup_middleware(self): """Setup middleware for CORS, logging, etc.""" @self.app.middleware("request") - async def add_cors_headers(_request: Request): + async def add_cors_headers(_request: Request) -> None: """Add CORS headers to all requests.""" + return None # noqa: RET501 @self.app.middleware("response") - async def add_response_headers(_request: Request, resp: HTTPResponse): + async def add_response_headers( + _request: Any, resp: BaseHTTPResponse + ) -> HTTPResponse: """Add standard response headers.""" resp.headers["Access-Control-Allow-Origin"] = "*" resp.headers["Access-Control-Allow-Methods"] = "GET, POST, OPTIONS" resp.headers["Access-Control-Allow-Headers"] = "Content-Type, Authorization" resp.headers["Server"] = "guidellm-mock-server" + return resp # type: ignore[return-value] def _setup_routes(self): # noqa: C901 @self.app.get("/health") diff --git a/src/guidellm/preprocess/dataset.py b/src/guidellm/preprocess/dataset.py index 49ce7b09..033bf106 100644 --- a/src/guidellm/preprocess/dataset.py +++ b/src/guidellm/preprocess/dataset.py @@ -276,8 +276,8 @@ def process_dataset( processor_args, "dataset conversion.", ) - prompt_column = column_mappings.get("prompt_column") - output_column = column_mappings.get( + prompt_column = column_mappings.get("prompt_column") # type: ignore[attr-defined] + output_column = column_mappings.get( # type: ignore[attr-defined] "output_tokens_count_column", "output_tokens_count" ) @@ -304,7 +304,7 @@ def process_dataset( ) ) - dataset_iterator = iter(dataset) + dataset_iterator = iter(dataset) # type: ignore[call-overload] processed_prompts = [] prompt_handler = STRATEGY_HANDLERS[short_prompt_strategy] diff --git a/src/guidellm/presentation/__init__.py b/src/guidellm/presentation/__init__.py deleted file mode 100644 index 872188db..00000000 --- a/src/guidellm/presentation/__init__.py +++ /dev/null @@ -1,28 +0,0 @@ -from .builder import UIDataBuilder -from .data_models import ( - BenchmarkDatum, - Bucket, - Dataset, - Distribution, - Model, - RunInfo, - Server, - 
TokenDetails, - WorkloadDetails, -) -from .injector import create_report, inject_data - -__all__ = [ - "BenchmarkDatum", - "Bucket", - "Dataset", - "Distribution", - "Model", - "RunInfo", - "Server", - "TokenDetails", - "UIDataBuilder", - "WorkloadDetails", - "create_report", - "inject_data", -] diff --git a/src/guidellm/presentation/builder.py b/src/guidellm/presentation/builder.py deleted file mode 100644 index 6ea9c5c3..00000000 --- a/src/guidellm/presentation/builder.py +++ /dev/null @@ -1,27 +0,0 @@ -from typing import TYPE_CHECKING, Any - -if TYPE_CHECKING: - from guidellm.benchmark import GenerativeBenchmark - -from guidellm.presentation.data_models import BenchmarkDatum, RunInfo, WorkloadDetails - - -class UIDataBuilder: - def __init__(self, benchmarks: list["GenerativeBenchmark"]): - self.benchmarks = benchmarks - - def build_run_info(self): - return RunInfo.from_benchmarks(self.benchmarks) - - def build_workload_details(self): - return WorkloadDetails.from_benchmarks(self.benchmarks) - - def build_benchmarks(self): - return [BenchmarkDatum.from_benchmark(b) for b in self.benchmarks] - - def to_dict(self) -> dict[str, Any]: - return { - "run_info": self.build_run_info().model_dump(), - "workload_details": self.build_workload_details().model_dump(), - "benchmarks": [b.model_dump() for b in self.build_benchmarks()], - } diff --git a/src/guidellm/presentation/data_models.py b/src/guidellm/presentation/data_models.py deleted file mode 100644 index deec925c..00000000 --- a/src/guidellm/presentation/data_models.py +++ /dev/null @@ -1,236 +0,0 @@ -import random -from collections import defaultdict -from math import ceil -from typing import TYPE_CHECKING - -from pydantic import BaseModel, computed_field - -if TYPE_CHECKING: - from guidellm.benchmark import GenerativeBenchmark - -from guidellm.utils import DistributionSummary - - -class Bucket(BaseModel): - value: float | int - count: int - - @staticmethod - def from_data( - data: list[float] | list[int], - bucket_width: float | None = None, - n_buckets: int | None = None, - ) -> tuple[list["Bucket"], float]: - if not data: - return [], 1.0 - - min_v = min(data) - max_v = max(data) - range_v = (1 + max_v) - min_v - - if bucket_width is None: - if n_buckets is None: - n_buckets = 10 - bucket_width = range_v / n_buckets - else: - n_buckets = ceil(range_v / bucket_width) - - bucket_counts: defaultdict[float | int, int] = defaultdict(int) - for val in data: - idx = int((val - min_v) // bucket_width) - if idx >= n_buckets: - idx = n_buckets - 1 - bucket_start = min_v + idx * bucket_width - bucket_counts[bucket_start] += 1 - - buckets = [ - Bucket(value=start, count=count) - for start, count in sorted(bucket_counts.items()) - ] - return buckets, bucket_width - - -class Model(BaseModel): - name: str - size: int - - -class Dataset(BaseModel): - name: str - - -class RunInfo(BaseModel): - model: Model - task: str - timestamp: float - dataset: Dataset - - @classmethod - def from_benchmarks(cls, benchmarks: list["GenerativeBenchmark"]): - model = benchmarks[0].benchmarker.backend.get("model", "N/A") - timestamp = max( - bm.run_stats.start_time for bm in benchmarks if bm.start_time is not None - ) - return cls( - model=Model(name=model or "", size=0), - task="N/A", - timestamp=timestamp, - dataset=Dataset(name="N/A"), - ) - - -class Distribution(BaseModel): - statistics: DistributionSummary | None = None - buckets: list[Bucket] - bucket_width: float - - -class TokenDetails(BaseModel): - samples: list[str] - token_distributions: Distribution - - -class 
Server(BaseModel): - target: str - - -class RequestOverTime(BaseModel): - num_benchmarks: int - requests_over_time: Distribution - - -class WorkloadDetails(BaseModel): - prompts: TokenDetails - generations: TokenDetails - requests_over_time: RequestOverTime - rate_type: str - server: Server - - @classmethod - def from_benchmarks(cls, benchmarks: list["GenerativeBenchmark"]): - target = benchmarks[0].benchmarker.backend.get("target", "N/A") - rate_type = benchmarks[0].scheduler.strategy.type_ - successful_requests = [ - req for bm in benchmarks for req in bm.requests.successful - ] - sample_indices = random.sample( - range(len(successful_requests)), min(5, len(successful_requests)) - ) - sample_prompts = [ - req.request_args.replace("\n", " ").replace('"', "'") - if (req := successful_requests[i]).request_args - else "" - for i in sample_indices - ] - sample_outputs = [ - req.output.replace("\n", " ").replace('"', "'") - if (req := successful_requests[i]).output - else "" - for i in sample_indices - ] - - prompt_tokens = [ - float(req.prompt_tokens) if req.prompt_tokens is not None else -1 - for bm in benchmarks - for req in bm.requests.successful - ] - output_tokens = [ - float(req.output_tokens) if req.output_tokens is not None else -1 - for bm in benchmarks - for req in bm.requests.successful - ] - - prompt_token_buckets, _prompt_token_bucket_width = Bucket.from_data( - prompt_tokens, 1 - ) - output_token_buckets, _output_token_bucket_width = Bucket.from_data( - output_tokens, 1 - ) - - prompt_token_stats = DistributionSummary.from_values(prompt_tokens) - output_token_stats = DistributionSummary.from_values(output_tokens) - prompt_token_distributions = Distribution( - statistics=prompt_token_stats, buckets=prompt_token_buckets, bucket_width=1 - ) - output_token_distributions = Distribution( - statistics=output_token_stats, buckets=output_token_buckets, bucket_width=1 - ) - - min_start_time = benchmarks[0].start_time - - all_req_times = [ - req.info.timings.request_start - min_start_time - for bm in benchmarks - for req in bm.requests.successful - if req.info.timings.request_start is not None - ] - number_of_buckets = len(benchmarks) - request_over_time_buckets, bucket_width = Bucket.from_data( - all_req_times, None, number_of_buckets - ) - request_over_time_distribution = Distribution( - buckets=request_over_time_buckets, bucket_width=bucket_width - ) - return cls( - prompts=TokenDetails( - samples=sample_prompts, token_distributions=prompt_token_distributions - ), - generations=TokenDetails( - samples=sample_outputs, token_distributions=output_token_distributions - ), - requests_over_time=RequestOverTime( - requests_over_time=request_over_time_distribution, - num_benchmarks=number_of_buckets, - ), - rate_type=rate_type, - server=Server(target=target), - ) - - -class TabularDistributionSummary(DistributionSummary): - """ - Same fields as `DistributionSummary`, but adds a ready-to-serialize/iterate - `percentile_rows` helper. 
- """ - - @computed_field - def percentile_rows(self) -> list[dict[str, str | float]]: - rows = [ - {"percentile": name, "value": value} - for name, value in self.percentiles.model_dump().items() - ] - return list( - filter(lambda row: row["percentile"] in ["p50", "p90", "p95", "p99"], rows) - ) - - @classmethod - def from_distribution_summary( - cls, distribution: DistributionSummary - ) -> "TabularDistributionSummary": - return cls(**distribution.model_dump()) - - -class BenchmarkDatum(BaseModel): - requests_per_second: float - itl: TabularDistributionSummary - ttft: TabularDistributionSummary - throughput: TabularDistributionSummary - time_per_request: TabularDistributionSummary - - @classmethod - def from_benchmark(cls, bm: "GenerativeBenchmark"): - return cls( - requests_per_second=bm.metrics.requests_per_second.successful.mean, - itl=TabularDistributionSummary.from_distribution_summary( - bm.metrics.inter_token_latency_ms.successful - ), - ttft=TabularDistributionSummary.from_distribution_summary( - bm.metrics.time_to_first_token_ms.successful - ), - throughput=TabularDistributionSummary.from_distribution_summary( - bm.metrics.output_tokens_per_second.successful - ), - time_per_request=TabularDistributionSummary.from_distribution_summary( - bm.metrics.request_latency.successful - ), - ) diff --git a/src/guidellm/presentation/injector.py b/src/guidellm/presentation/injector.py deleted file mode 100644 index 1e78080e..00000000 --- a/src/guidellm/presentation/injector.py +++ /dev/null @@ -1,65 +0,0 @@ -import re -from pathlib import Path - -from loguru import logger - -from guidellm.settings import settings -from guidellm.utils.text import load_text - - -def create_report(js_data: dict, output_path: str | Path) -> Path: - """ - Creates a report from the dictionary and saves it to the output path. - - :param js_data: dict with match str and json data to inject - :type js_data: dict - :param output_path: the file to save the report to. - :type output_path: str - :return: the path to the saved report - :rtype: str - """ - - if not isinstance(output_path, Path): - output_path = Path(output_path) - - html_content = load_text(settings.report_generation.source) - report_content = inject_data( - js_data, - html_content, - ) - - output_path.parent.mkdir(parents=True, exist_ok=True) - output_path.write_text(report_content) - return output_path - - -def inject_data( - js_data: dict, - html: str, -) -> str: - """ - Injects the json data into the HTML, - replacing placeholders only within the section. 
-
-    :param js_data: the json data to inject
-    :type js_data: dict
-    :param html: the html to inject the data into
-    :type html: str
-    :return: the html with the json data injected
-    :rtype: str
-    """
-    head_match = re.search(r"<head[^>]*>(.*?)</head>", html, re.DOTALL | re.IGNORECASE)
-    if not head_match:
-        logger.warning("<head> section missing, returning original HTML.")
-
-        return html
-
-    head_content = head_match.group(1)
-
-    # Replace placeholders only inside the <head> content
-    for placeholder, script in js_data.items():
-        head_content = head_content.replace(placeholder, script)
-
-    # Rebuild the HTML
-    new_head = f"<head>{head_content}</head>"
-    return html[: head_match.start()] + new_head + html[head_match.end() :]
diff --git a/src/guidellm/scheduler/constraints.py b/src/guidellm/scheduler/constraints.py
index e24419ea..bbf34fb4 100644
--- a/src/guidellm/scheduler/constraints.py
+++ b/src/guidellm/scheduler/constraints.py
@@ -21,9 +21,9 @@
     SchedulerUpdateAction,
     SchedulerUpdateActionProgress,
 )
-from guidellm.schemas import RequestInfo
+from guidellm.schemas import RequestInfo, StandardBaseModel
 from guidellm.settings import settings
-from guidellm.utils import InfoMixin, RegistryMixin, StandardBaseModel
+from guidellm.utils import InfoMixin, RegistryMixin
 
 __all__ = [
     "Constraint",
diff --git a/src/guidellm/scheduler/scheduler.py b/src/guidellm/scheduler/scheduler.py
index 6da76438..1b5e28f6 100644
--- a/src/guidellm/scheduler/scheduler.py
+++ b/src/guidellm/scheduler/scheduler.py
@@ -24,7 +24,7 @@
 from guidellm.scheduler.strategies import SchedulingStrategy
 from guidellm.scheduler.worker_group import WorkerProcessGroup
 from guidellm.schemas import RequestInfo
-from guidellm.utils.singleton import ThreadSafeSingletonMixin
+from guidellm.utils import ThreadSafeSingletonMixin
 
 __all__ = ["Scheduler"]
 
diff --git a/src/guidellm/scheduler/schemas.py b/src/guidellm/scheduler/schemas.py
index 21567c67..b202b010 100644
--- a/src/guidellm/scheduler/schemas.py
+++ b/src/guidellm/scheduler/schemas.py
@@ -16,8 +16,8 @@
 from pydantic import Field
 from typing_extensions import TypeAliasType, TypedDict
 
-from guidellm.schemas import RequestInfo
-from guidellm.utils import RegistryMixin, StandardBaseModel
+from guidellm.schemas import RequestInfo, StandardBaseModel
+from guidellm.utils import RegistryMixin
 from guidellm.utils.registry import RegistryObjT
 
 __all__ = [
diff --git a/src/guidellm/scheduler/strategies.py b/src/guidellm/scheduler/strategies.py
index e1473b93..9fce40ab 100644
--- a/src/guidellm/scheduler/strategies.py
+++ b/src/guidellm/scheduler/strategies.py
@@ -20,8 +20,8 @@
 
 from pydantic import Field, PrivateAttr
 
-from guidellm.schemas import RequestInfo
-from guidellm.utils import InfoMixin, PydanticClassRegistryMixin
+from guidellm.schemas import PydanticClassRegistryMixin, RequestInfo
+from guidellm.utils import InfoMixin
 
 __all__ = [
     "AsyncConstantStrategy",
diff --git a/src/guidellm/scheduler/worker.py b/src/guidellm/scheduler/worker.py
index 977635fa..6f37b1da 100644
--- a/src/guidellm/scheduler/worker.py
+++ b/src/guidellm/scheduler/worker.py
@@ -408,9 +408,8 @@ async def _dequeue_next_request(
     async def _schedule_request(
         self, request: RequestT, request_info: RequestInfo, target_start: float
     ):
-        current_time = time.time()
-        request_info.timings.scheduled_at = current_time
-        if target_start > current_time:
+        request_info.timings.scheduled_at = request_info.timings.dequeued
+        if target_start > (current_time := time.time()):
             await asyncio.sleep(target_start - current_time)
             # Adapt delay so that scheduled at reflects
the sleep time request_info.timings.scheduled_at = target_start diff --git a/src/guidellm/scheduler/worker_group.py b/src/guidellm/scheduler/worker_group.py index 2a0a51de..d30403a6 100644 --- a/src/guidellm/scheduler/worker_group.py +++ b/src/guidellm/scheduler/worker_group.py @@ -228,11 +228,11 @@ async def create_processes(self): worker = WorkerProcess[RequestT, ResponseT]( worker_index=rank, - messaging=self.messaging.create_worker_copy( + messaging=self.messaging.create_worker_copy( # type: ignore[arg-type] worker_index=rank, max_buffer_send_size=None, max_buffer_receive_size=per_proc_max_buffer_size, - ), # The non-group worker lacks the SchedulerState type. Type err. + ), backend=self.backend, strategy=self.strategy, async_limit=async_limit, @@ -632,6 +632,8 @@ def _locked_update( ) def _update_state_request_counts(self, info: RequestInfo): + finalized = time.time() + if info.status == "queued": self._queued_request_ids.add(info.request_id) self._state.queued_requests = len(self._queued_request_ids) @@ -647,11 +649,13 @@ def _update_state_request_counts(self, info: RequestInfo): self._processing_request_ids.add(info.request_id) self._state.processing_requests = len(self._processing_request_ids) elif info.status == "completed": + info.timings.finalized = finalized self._processing_request_ids.remove(info.request_id) self._state.processing_requests = len(self._processing_request_ids) self._state.processed_requests += 1 self._state.successful_requests += 1 elif info.status in ("errored", "cancelled"): + info.timings.finalized = finalized if info.request_id in self._queued_request_ids: self._queued_request_ids.remove(info.request_id) self._state.queued_requests = len(self._queued_request_ids) diff --git a/src/guidellm/schemas/__init__.py b/src/guidellm/schemas/__init__.py index 42268f72..d230c204 100644 --- a/src/guidellm/schemas/__init__.py +++ b/src/guidellm/schemas/__init__.py @@ -9,6 +9,13 @@ from __future__ import annotations +from .base import ( + PydanticClassRegistryMixin, + ReloadableBaseModel, + StandardBaseDict, + StandardBaseModel, + StatusBreakdown, +) from .info import RequestInfo, RequestTimings from .request import ( GenerationRequest, @@ -16,16 +23,31 @@ GenerativeRequestType, UsageMetrics, ) +from .request_stats import GenerativeRequestStats from .response import GenerationResponse -from .stats import GenerativeRequestStats +from .statistics import ( + DistributionSummary, + FunctionObjT, + Percentiles, + StatusDistributionSummary, +) __all__ = [ + "DistributionSummary", + "FunctionObjT", "GenerationRequest", "GenerationRequestArguments", "GenerationResponse", "GenerativeRequestStats", "GenerativeRequestType", + "Percentiles", + "PydanticClassRegistryMixin", + "ReloadableBaseModel", "RequestInfo", "RequestTimings", + "StandardBaseDict", + "StandardBaseModel", + "StatusBreakdown", + "StatusDistributionSummary", "UsageMetrics", ] diff --git a/src/guidellm/utils/pydantic_utils.py b/src/guidellm/schemas/base.py similarity index 100% rename from src/guidellm/utils/pydantic_utils.py rename to src/guidellm/schemas/base.py diff --git a/src/guidellm/schemas/info.py b/src/guidellm/schemas/info.py index 4b5d188c..854756d4 100644 --- a/src/guidellm/schemas/info.py +++ b/src/guidellm/schemas/info.py @@ -14,7 +14,7 @@ from pydantic import Field, computed_field -from guidellm.utils import StandardBaseDict, StandardBaseModel +from guidellm.schemas.base import StandardBaseDict, StandardBaseModel __all__ = ["RequestInfo", "RequestTimings"] @@ -53,17 +53,23 @@ class 
RequestTimings(StandardBaseDict): default=None, description="Unix timestamp when the backend began processing the request", ) - first_iteration: float | None = Field( + first_request_iteration: float | None = Field( default=None, - description="Unix timestamp when the first iteration for a streaming began", ) - last_iteration: float | None = Field( + first_token_iteration: float | None = Field( default=None, - description="Unix timestamp when the last iteration for a streaming completed", ) - iterations: int | None = Field( + last_token_iteration: float | None = Field( default=None, - description="Total number of streaming update iterations performed", + ) + last_request_iteration: float | None = Field( + default=None, + ) + request_iterations: int = Field( + default=0, + ) + token_iterations: int = Field( + default=0, ) request_end: float | None = Field( default=None, @@ -78,6 +84,25 @@ class RequestTimings(StandardBaseDict): description="Unix timestamp when request was processed by the scheduler", ) + @property + def last_reported(self) -> float | None: + """ + Get the most recent timing measurement available. + + :return: The latest Unix timestamp from the timing fields, or None if none + """ + timing_fields = [ + self.queued, + self.dequeued, + self.scheduled_at, + self.resolve_start, + self.request_start, + self.request_end, + self.resolve_end, + ] + valid_timings = [field for field in timing_fields if field is not None] + return max(valid_timings) if valid_timings else None + class RequestInfo(StandardBaseModel): """ diff --git a/src/guidellm/schemas/request.py b/src/guidellm/schemas/request.py index 1f90d130..ed9a31f4 100644 --- a/src/guidellm/schemas/request.py +++ b/src/guidellm/schemas/request.py @@ -14,7 +14,7 @@ from pydantic import Field, computed_field -from guidellm.utils import StandardBaseDict, StandardBaseModel +from guidellm.schemas.base import StandardBaseDict, StandardBaseModel __all__ = [ "GenerationRequest", diff --git a/src/guidellm/schemas/request_stats.py b/src/guidellm/schemas/request_stats.py new file mode 100644 index 00000000..10db80be --- /dev/null +++ b/src/guidellm/schemas/request_stats.py @@ -0,0 +1,333 @@ +""" +Request statistics and metrics for generative AI benchmark analysis. + +Provides data structures for capturing and analyzing performance metrics from +generative AI workloads. The module contains request-level statistics including +token counts, latency measurements, and throughput calculations essential for +evaluating text generation benchmark performance. Computed properties enable +analysis of time-to-first-token, inter-token latency, and token generation rates. +""" + +from __future__ import annotations + +from typing import Literal + +import numpy as np +from pydantic import Field, computed_field + +from guidellm.schemas.base import StandardBaseDict +from guidellm.schemas.info import RequestInfo +from guidellm.schemas.request import GenerativeRequestType, UsageMetrics + +__all__ = ["GenerativeRequestStats"] + + +class GenerativeRequestStats(StandardBaseDict): + """ + Request statistics for generative AI text generation workloads. + + Captures comprehensive performance metrics for individual generative requests, + including token counts, timing measurements, and derived performance statistics. + Provides computed properties for latency analysis, throughput calculations, + and token generation metrics essential for benchmark evaluation. 
+ + Example: + :: + stats = GenerativeRequestStats( + request_id="req_123", + request_type="text_completion", + info=request_info, + input_metrics=input_usage, + output_metrics=output_usage + ) + throughput = stats.output_tokens_per_second + """ + + type_: Literal["generative_request_stats"] = "generative_request_stats" + request_id: str = Field(description="Unique identifier for the request") + request_type: GenerativeRequestType | str = Field( + description="Type of generative request (text_completion or chat_completion)" + ) + request_args: str | None = Field( + default=None, description="Backend arguments used for this request" + ) + output: str | None = Field( + default=None, description="Generated text output from the request" + ) + info: RequestInfo = Field(description="Request metadata and timing information") + input_metrics: UsageMetrics = Field( + description="Token usage statistics for the input prompt" + ) + output_metrics: UsageMetrics = Field( + description="Token usage statistics for the generated output" + ) + + # Request stats + @computed_field # type: ignore[misc] + @property + def request_start_time(self) -> float | None: + """ + :return: Timestamp when the request started, or None if unavailable + """ + return self.info.timings.request_start or self.info.timings.resolve_start + + @computed_field # type: ignore[misc] + @property + def request_end_time(self) -> float: + """ + :return: Timestamp when the request ended, or None if unavailable + """ + if self.info.timings.resolve_end is None: + raise ValueError("resolve_end timings should be set but is None.") + + return self.info.timings.request_end or self.info.timings.resolve_end + + @computed_field # type: ignore[misc] + @property + def request_latency(self) -> float | None: + """ + End-to-end request processing latency in seconds. 
+ + :return: Duration from request start to completion, or None if unavailable + """ + if not (start := self.info.timings.request_start) or not ( + end := self.info.timings.request_end + ): + return None + + return end - start + + # General token stats + @computed_field # type: ignore[misc] + @property + def prompt_tokens(self) -> int | None: + """ + :return: Number of tokens in the input prompt, or None if unavailable + """ + return self.input_metrics.text_tokens + + @computed_field # type: ignore[misc] + @property + def input_tokens(self) -> int | None: + """ + :return: Number of tokens in the input prompt, or None if unavailable + """ + return self.input_metrics.total_tokens + + @computed_field # type: ignore[misc] + @property + def output_tokens(self) -> int | None: + """ + :return: Number of tokens in the generated output, or None if unavailable + """ + return self.output_metrics.total_tokens + + @computed_field # type: ignore[misc] + @property + def total_tokens(self) -> int | None: + """ + :return: Sum of prompt and output tokens, or None if both unavailable + """ + input_tokens = self.input_metrics.total_tokens + output_tokens = self.output_metrics.total_tokens + + if input_tokens is None and output_tokens is None: + return None + + return (input_tokens or 0) + (output_tokens or 0) + + @computed_field # type: ignore[misc] + @property + def time_to_first_token_ms(self) -> float | None: + """ + :return: Time to first token generation in milliseconds, or None if unavailable + """ + if not (first_token := self.first_token_iteration) or not ( + start := self.info.timings.request_start + ): + return None + + return 1000 * (first_token - start) + + @computed_field # type: ignore[misc] + @property + def time_per_output_token_ms(self) -> float | None: + """ + Average time per output token in milliseconds including first token. + + :return: Average milliseconds per output token, or None if unavailable + """ + if ( + not (start := self.info.timings.request_start) + or not (last_token := self.last_token_iteration) + or not (output_tokens := self.output_tokens) + ): + return None + + return 1000 * (last_token - start) / output_tokens + + @computed_field # type: ignore[misc] + @property + def inter_token_latency_ms(self) -> float | None: + """ + Average inter-token latency in milliseconds excluding first token. 
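        For example, with hypothetical timings where the first token arrives at t=10.0 s, the last token at t=12.0 s, and 21 output tokens are generated (20 inter-token gaps):
        ::
            inter_token_latency_ms = 1000 * (12.0 - 10.0) / (21 - 1)  # 100.0 ms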
+ + :return: Average milliseconds between token generations, or None if unavailable + """ + if ( + not (first_token := self.first_token_iteration) + or not (last_token := self.last_token_iteration) + or not (output_tokens := self.output_tokens) + or output_tokens <= 1 + ): + return None + + return 1000 * (last_token - first_token) / (output_tokens - 1) + + @computed_field # type: ignore[misc] + @property + def tokens_per_second(self) -> float | None: + """ + :return: Total tokens per second throughput, or None if unavailable + """ + if not (latency := self.request_latency) or self.total_tokens is None: + return None + + return self.total_tokens / latency + + @computed_field # type: ignore[misc] + @property + def output_tokens_per_second(self) -> float | None: + """ + :return: Output token generation throughput, or None if unavailable + """ + if not (latency := self.request_latency) or self.output_tokens is None: + return None + + return self.output_tokens / latency + + @computed_field # type: ignore[misc] + @property + def iter_tokens_per_iteration(self) -> float | None: + """ + :return: Average tokens per iteration excluding first token, or None if + unavailable + """ + if ( + self.output_tokens is None + or self.output_tokens <= 1 + or self.token_iterations <= 1 + ): + return None + + return (self.output_tokens - 1.0) / ( + self.token_iterations - 1.0 + ) # subtract 1 for first token from the prompt, assume first iter is 1 token + + @computed_field # type: ignore[misc] + @property + def output_tokens_per_iteration(self) -> float | None: + """ + :return: Average output tokens per iteration, or None if unavailable + """ + if self.output_tokens is None or self.token_iterations < 1: + return None + + return self.output_tokens / self.token_iterations + + @property + def first_token_iteration(self) -> float | None: + """ + :return: Timestamp of first token generation, or None if unavailable + """ + return self.info.timings.first_token_iteration + + @property + def last_token_iteration(self) -> float | None: + """ + :return: Timestamp of last token generation, or None if unavailable + """ + return self.info.timings.last_token_iteration + + @property + def token_iterations(self) -> int: + """ + :return: Total number of token generation iterations + """ + return self.info.timings.token_iterations + + @property + def prompt_tokens_timing(self) -> tuple[float, float] | None: + """ + :return: Tuple of (timestamp, token_count) for prompt processing, or None + if unavailable + """ + if self.request_end_time is None: + # no end time, can't compute + return None + + return ( + self.first_token_iteration or self.request_end_time, + self.prompt_tokens or 0.0, + ) + + @property + def output_tokens_timings(self) -> list[tuple[float, float]]: + """ + :return: List of (timestamp, token_count) tuples for output token generations + """ + if self.request_end_time is None: + # no end time, can't compute + return [] + + if ( + self.first_token_iteration is None + or self.last_token_iteration is None + or self.token_iterations <= 1 + ): + # No iteration data, return single timing at end with all tokens + return [ + ( + self.last_token_iteration or self.request_end_time, + self.output_tokens or 0.0, + ) + ] + + # Return first token timing as 1 token plus per-iteration timings + return [ + (self.first_token_iteration, 1.0 * bool(self.output_tokens)) + ] + self.iter_tokens_timings + + @property + def iter_tokens_timings(self) -> list[tuple[float, float]]: + """ + :return: List of (timestamp, token_count) tuples for 
iterations excluding + first token + """ + if ( + self.first_token_iteration is None + or self.last_token_iteration is None + or (tok_per_iter := self.iter_tokens_per_iteration) is None + or self.token_iterations <= 1 + ): + return [] + + # evenly space the iterations since we don't have per-iteration timings + # / we don't know the individual token counts per iteration + iter_times = np.linspace( + self.first_token_iteration, + self.last_token_iteration, + num=self.token_iterations, + )[1:] # skip first iteration + + return [(iter_time, tok_per_iter) for iter_time in iter_times] + + @property + def total_tokens_timings(self) -> list[tuple[float, float]]: + """ + :return: List of (timestamp, token_count) tuples for all token generations + """ + prompt_timings = self.prompt_tokens_timing + output_timings = self.output_tokens_timings + + return ([prompt_timings] if prompt_timings else []) + output_timings diff --git a/src/guidellm/schemas/response.py b/src/guidellm/schemas/response.py index d4e53aa3..a02ae8ba 100644 --- a/src/guidellm/schemas/response.py +++ b/src/guidellm/schemas/response.py @@ -11,10 +11,10 @@ from pydantic import Field +from guidellm.schemas.base import StandardBaseModel from guidellm.schemas.info import RequestInfo from guidellm.schemas.request import GenerationRequest, UsageMetrics -from guidellm.schemas.stats import GenerativeRequestStats -from guidellm.utils import StandardBaseModel +from guidellm.schemas.request_stats import GenerativeRequestStats __all__ = ["GenerationResponse"] diff --git a/src/guidellm/schemas/statistics.py b/src/guidellm/schemas/statistics.py new file mode 100644 index 00000000..bbfe666d --- /dev/null +++ b/src/guidellm/schemas/statistics.py @@ -0,0 +1,1002 @@ +""" +Statistical distribution analysis and summary calculations for benchmark metrics. + +Provides comprehensive statistical analysis tools including percentile calculations, +summary statistics, and status-based distributions. Supports value distributions, +time-based rate and concurrency distributions with weighted sampling, and probability +density functions for analyzing benchmark performance metrics and request patterns +across different status categories (successful, incomplete, errored). +""" + +from __future__ import annotations + +import math +from collections.abc import Callable, Sequence +from typing import Literal, TypeVar + +import numpy as np +from pydantic import Field + +from guidellm.schemas.base import StandardBaseModel, StatusBreakdown + +__all__ = [ + "DistributionSummary", + "FunctionObjT", + "Percentiles", + "StatusDistributionSummary", +] + +FunctionObjT = TypeVar("FunctionObjT") + + +class Percentiles(StandardBaseModel): + """ + Standard percentile values for probability distributions. + + Captures key percentile points from 0.1th to 99.9th percentile for comprehensive + distribution analysis, enabling assessment of central tendency, spread, and tail + behavior in benchmark metrics. 
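    Example (a minimal sketch; the PDF values below are hypothetical):
    ::
        import numpy as np

        from guidellm.schemas import Percentiles

        # four equally likely values; probabilities must sum to 1
        pdf = np.array([[1.0, 0.25], [2.0, 0.25], [3.0, 0.25], [4.0, 0.25]])
        percentiles = Percentiles.from_pdf(pdf)
        print(percentiles.p50, percentiles.p99)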
+ """ + + p001: float = Field(description="0.1th percentile value") + p01: float = Field(description="1st percentile value") + p05: float = Field(description="5th percentile value") + p10: float = Field(description="10th percentile value") + p25: float = Field(description="25th percentile value") + p50: float = Field(description="50th percentile (median) value") + p75: float = Field(description="75th percentile value") + p90: float = Field(description="90th percentile value") + p95: float = Field(description="95th percentile value") + p99: float = Field(description="99th percentile value") + p999: float = Field(description="99.9th percentile value") + + @classmethod + def from_pdf( + cls, pdf: np.ndarray, epsilon: float = 1e-6, validate: bool = True + ) -> Percentiles: + """ + Create percentiles from a probability density function. + + :param pdf: 2D array (N, 2) with values in column 0 and probabilities in + column 1 + :param epsilon: Tolerance for probability sum validation + :param validate: Whether to validate probabilities sum to 1 and are + non-negative + :return: Percentiles object with computed values + :raises ValueError: If PDF shape is invalid, probabilities are negative, + or probabilities don't sum to 1 + """ + expected_shape = (None, 2) + + if len(pdf.shape) != len(expected_shape) or pdf.shape[1] != expected_shape[1]: + raise ValueError( + "PDF must be a 2D array of shape (N, 2) where first column is values " + f"and second column is probabilities. Got {pdf.shape} instead." + ) + + percentile_probs = { + "p001": 0.001, + "p01": 0.01, + "p05": 0.05, + "p10": 0.1, + "p25": 0.25, + "p50": 0.5, + "p75": 0.75, + "p90": 0.9, + "p95": 0.95, + "p99": 0.99, + "p999": 0.999, + } + + if pdf.shape[0] == 0: + return Percentiles(**dict.fromkeys(percentile_probs.keys(), 0.0)) + + probabilities = pdf[:, 1] + + if validate: + if np.any(probabilities < 0): + raise ValueError("Probabilities must be non-negative.") + + prob_sum = np.sum(probabilities) + if abs(prob_sum - 1.0) > epsilon: + raise ValueError(f"Probabilities must sum to 1, got {prob_sum}.") + + cdf_probs = np.cumsum(probabilities) + + return Percentiles( + **{ + key: pdf[np.searchsorted(cdf_probs, value, side="left"), 0].item() + for key, value in percentile_probs.items() + } + ) + + +class DistributionSummary(StandardBaseModel): + """ + Comprehensive statistical summary of a probability distribution. + + Captures central tendency (mean, median, mode), spread (variance, std_dev), + extrema (min, max), and percentile information with optional probability density + function. Supports creation from raw values, PDFs, or time-based event data for + rate and concurrency analysis in benchmark metrics. 
+ """ + + mean: float = Field(description="Mean/average value") + median: float = Field(description="Median (50th percentile) value") + mode: float = Field(description="Mode (most probable) value") + variance: float = Field(description="Variance of the distribution") + std_dev: float = Field(description="Standard deviation") + min: float = Field(description="Minimum value") + max: float = Field(description="Maximum value") + count: int = Field(description="Number of observations") + total_sum: float = Field(description="Sum of all values") + percentiles: Percentiles = Field(description="Standard percentile values") + pdf: list[tuple[float, float]] | None = Field( + description="Probability density function as (value, probability) pairs", + default=None, + ) + + @classmethod + def from_pdf( + cls, + pdf: np.ndarray, + count: int | None = None, + include_pdf: bool | int = False, + epsilon: float = 1e-6, + validate: bool = True, + ) -> DistributionSummary: + """ + Create distribution summary from a probability density function. + + :param pdf: 2D array (N, 2) with values in column 0 and probabilities in + column 1 + :param count: Number of original observations; defaults to PDF length + :param include_pdf: Whether to include PDF; True for full, int for sampled size + :param epsilon: Tolerance for probability validation + :param validate: Whether to validate probabilities sum to 1 and are non-negative + :return: Complete distribution summary with statistics + :raises ValueError: If PDF shape is invalid or probabilities are invalid + """ + expected_shape = (None, 2) + + if len(pdf.shape) != len(expected_shape) or pdf.shape[1] != expected_shape[1]: + raise ValueError( + "PDF must be a 2D array of shape (N, 2) where first column is values " + f"and second column is probabilities. Got {pdf.shape} instead." 
+ ) + + if pdf.shape[0] == 0: + return DistributionSummary( + mean=0.0, + median=0.0, + mode=0.0, + variance=0.0, + std_dev=0.0, + min=0.0, + max=0.0, + count=0 if count is None else count, + total_sum=0.0, + percentiles=Percentiles.from_pdf(pdf, epsilon=epsilon), + pdf=None if include_pdf is False else [], + ) + + # Calculate stats + values = pdf[:, 0] + probabilities = pdf[:, 1] + + if validate: + # Fail if probabilities don't sum to 1 or are negative + if np.any(probabilities < 0): + raise ValueError("Probabilities must be non-negative.") + + prob_sum = np.sum(probabilities) + if not np.isclose(prob_sum, 1.0, atol=epsilon): + raise ValueError(f"Probabilities must sum to 1.0 (sum={prob_sum}).") + + # Fail if values are not sorted + if not np.all(values[:-1] <= values[1:]): + raise ValueError("Values in PDF must be sorted in ascending order.") + + percentiles = Percentiles.from_pdf(pdf, epsilon=epsilon, validate=False) + median = percentiles.p50 + mean = np.sum(values * probabilities).item() + mode = values[np.argmax(probabilities)].item() + variance = np.sum((values - mean) ** 2 * probabilities).item() + std_dev = math.sqrt(variance) + minimum = values[0].item() + maximum = values[-1].item() + + if count is None: + count = len(pdf) + + total_sum = mean * count + + if include_pdf is False: + sampled_pdf = None + elif include_pdf is True: + sampled_pdf = pdf.tolist() + else: + sampled_pdf = [] + + return DistributionSummary( + mean=mean, + median=median, + mode=mode, + variance=variance, + std_dev=std_dev, + min=minimum, + max=maximum, + count=count, + total_sum=total_sum, + percentiles=percentiles, + pdf=sampled_pdf, + ) + + @classmethod + def from_values( + cls, + values: Sequence[float | tuple[float, float]] | np.ndarray, + count: int | None = None, + include_pdf: bool | int = False, + epsilon: float = 1e-6, + ) -> DistributionSummary: + """ + Create distribution summary from raw values with optional weights. 
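        For example, (value, weight) tuples allow duration-weighted samples, where the weights below are hypothetical observation durations:
        ::
            # value 10.0 observed for 2.0 units of time, value 20.0 for 1.0
            summary = DistributionSummary.from_values([(10.0, 2.0), (20.0, 1.0)])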
+ + :param values: Values or (value, weight) tuples, or numpy array + :param count: Number of original observations; defaults to sum of weights + :param include_pdf: Whether to include PDF; True for full, int for sampled size + :param epsilon: Tolerance for probability validation + :return: Distribution summary computed from the values + :raises ValueError: If total weight is zero or invalid + """ + np_values = cls._to_weighted_ndarray(values, num_values_per_item=2) + + if np_values.shape[0] == 0: + return DistributionSummary.from_pdf( + pdf=np.empty((0, 2)), count=0, include_pdf=include_pdf, epsilon=epsilon + ) + + if count is None: + count = round(np.sum(np_values[:, 1]).item()) + + # Sort values and weights by values + sort_ind = np.argsort(np_values[:, 0]) + sorted_values = np_values[sort_ind, 0] + sorted_weights = np_values[sort_ind, 1] + + # Combine any duplicate values by summing their weights + unique_values, inverse_indices = np.unique(sorted_values, return_inverse=True) + combined_weights = np.zeros_like(unique_values, dtype=float) + np.add.at(combined_weights, inverse_indices, sorted_weights) + + # Remove any values with zero weight + nonzero_mask = combined_weights > 0 + final_values = unique_values[nonzero_mask] + final_weights = combined_weights[nonzero_mask] + + # Create PDF by normalizing weights and stacking + total_weight = np.sum(final_weights) + if total_weight <= epsilon: + # No valid weights to create PDF, overwrite to uniform distribution + final_weights = np.ones_like(final_values) + total_weight = np.sum(final_weights) + + probabilities = final_weights / total_weight + pdf = np.column_stack((final_values, probabilities)) + + return DistributionSummary.from_pdf( + pdf=pdf, + count=count, + include_pdf=include_pdf, + epsilon=epsilon, + validate=False, + ) + + @classmethod + def rate_distribution_from_timings( + cls, + event_times: Sequence[float | tuple[float, float]] | np.ndarray, + start_time: float | None = None, + end_time: float | None = None, + threshold: float | None = 1e-4, # 1/10th of a millisecond + include_pdf: bool | int = False, + epsilon: float = 1e-6, + ) -> DistributionSummary: + """ + Create rate distribution from event timestamps. + + Computes event rates over time intervals weighted by interval duration for + analyzing request throughput patterns. 
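        For example, request completion timestamps (hypothetical Unix times) yield a completions-per-second distribution:
        ::
            completion_times = [100.0, 100.4, 100.9, 101.5, 102.0]
            rate = DistributionSummary.rate_distribution_from_timings(
                completion_times, start_time=100.0, end_time=102.0
            )
            print(rate.mean)  # duration-weighted average events per second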
+ + :param event_times: Event timestamps or (timestamp, weight) tuples + :param start_time: Analysis window start; filters earlier events + :param end_time: Analysis window end; filters later events + :param threshold: Time threshold for merging nearby events; 1/10th millisecond + :param include_pdf: Whether to include PDF; True for full, int for sampled size + :param epsilon: Tolerance for probability validation + :return: Distribution summary of event rates over time + """ + weighted_times = cls._to_weighted_ndarray(event_times, num_values_per_item=2) + + if start_time is not None: + # Filter out any times before start, insert start time with 0 weight + weighted_times = np.insert( + weighted_times[weighted_times[:, 0] >= start_time], + 0, + [start_time, 0.0], + axis=0, + ) + + if end_time is not None: + # Filter out any times after end, insert end time with 0 weight + weighted_times = np.append( + weighted_times[weighted_times[:, 0] <= end_time], + [[end_time, 0.0]], + axis=0, + ) + + # Sort by time for merging, merge any times within threshold + sort_ind = np.argsort(weighted_times[:, 0]) + weighted_times = weighted_times[sort_ind] + weighted_times = cls._merge_sorted_times_with_weights(weighted_times, threshold) + + if len(weighted_times) <= 1: + # No data to calculate rates from (need at least two times) + return cls.from_values( + [], + count=len(weighted_times), + include_pdf=include_pdf, + epsilon=epsilon, + ) + + times = weighted_times[:, 0] + occurrences = weighted_times[:, 1] + + # Calculate local duration for each event: ((times[i+1] - times[i-1])) / 2 + midpoints = (times[1:] + times[:-1]) / 2 + durations = np.empty_like(times) + durations[0] = midpoints[0] - times[0] + durations[1:-1] = midpoints[1:] - midpoints[:-1] + durations[-1] = np.clip(times[-1] - midpoints[-1], epsilon, None) + + # Calculate rate at each interval: occurences[i] / duration[i] + rates = occurrences / durations + count = round(np.sum(occurrences).item()) + + return cls.from_values( + np.column_stack((rates, durations)), + count=count, + include_pdf=include_pdf, + epsilon=epsilon, + ) + + @classmethod + def concurrency_distribution_from_timings( + cls, + event_intervals: ( + Sequence[tuple[float, float] | tuple[float, float, float]] | np.ndarray + ), + start_time: float | None = None, + end_time: float | None = None, + threshold: float | None = 1e-4, # 1/10th of a millisecond + include_pdf: bool | int = False, + epsilon: float = 1e-6, + ) -> DistributionSummary: + """ + Create concurrency distribution from event time intervals. + + Tracks overlapping events to compute concurrency levels over time for analyzing + request processing patterns and resource utilization. 
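        For example, overlapping (start, end) intervals (hypothetical request lifetimes) produce a concurrency-over-time distribution:
        ::
            intervals = [(100.0, 102.0), (101.0, 103.0), (101.5, 102.5)]
            concurrency = DistributionSummary.concurrency_distribution_from_timings(intervals)
            print(concurrency.max)  # peak number of simultaneously active requests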
+ + :param event_intervals: Event (start, end) or (start, end, weight) tuples + :param start_time: Analysis window start + :param end_time: Analysis window end + :param threshold: Time threshold for merging nearby transitions; + 1/10th millisecond + :param include_pdf: Whether to include PDF; True for full, int for sampled size + :param epsilon: Tolerance for probability validation + :return: Distribution summary of concurrency levels over time + """ + weighted_intervals = cls._to_weighted_ndarray( + event_intervals, num_values_per_item=3 + ) + + # If start_time, filter any intervals that end before start_time + if start_time is not None: + keep_mask = weighted_intervals[:, 1] >= start_time + weighted_intervals = weighted_intervals[keep_mask] + + # If end_time, filter any intervals that start after end_time + if end_time is not None: + keep_mask = weighted_intervals[:, 0] <= end_time + weighted_intervals = weighted_intervals[keep_mask] + + count = len(weighted_intervals) + + # Convert to concurrency changes at each time + add_occurences = ( + np.stack( + ( + weighted_intervals[:, 0], + weighted_intervals[:, 2], + ), + axis=1, + ) + if len(weighted_intervals) > 0 + else np.empty((0, 2)) + ) + remove_occurences = ( + np.stack( + ( + weighted_intervals[:, 1], + -1 * weighted_intervals[:, 2], + ), + axis=1, + ) + if len(weighted_intervals) > 0 + else np.empty((0, 2)) + ) + + # Combine add and remove occurences into weighted times + weighted_times = np.vstack((add_occurences, remove_occurences)) + + # Sort by the times and merge any times within threshold + weighted_times = weighted_times[np.argsort(weighted_times[:, 0])] + weighted_times = cls._merge_sorted_times_with_weights(weighted_times, threshold) + + # If start_time, ensure included (if any before, add final concurrency at start) + if start_time is not None and len(weighted_times) > 0: + start_ind = np.searchsorted(weighted_times[:, 0], start_time, side="left") + prior_delta = ( + np.sum(weighted_times[:start_ind, 1]) if start_ind > 0 else 0.0 + ) + weighted_times = np.insert( + weighted_times[start_ind:], 0, [start_time, prior_delta], axis=0 + ) + + # If end_time, ensure included (if any after, filter out) + if end_time is not None and len(weighted_times) > 0: + end_ind = np.searchsorted(weighted_times[:, 0], end_time, side="right") + weighted_times = np.append( + weighted_times[:end_ind], [[end_time, 0.0]], axis=0 + ) + + # Calculate concurrency from cumulative sum of changes over time + concurrencies = np.clip(np.cumsum(weighted_times[:, 1]), 0, None) + + if len(concurrencies) <= 1: + # No data to calculate concurrency from + return cls.from_values( + [] if count == 0 else [concurrencies[0].item()], + include_pdf=include_pdf, + epsilon=epsilon, + ) + + # Calculate durations equal to times[i+1] - times[i] + # The last concurrency level is not used since no following time point + durations = np.clip(np.diff(weighted_times[:, 0]), 0, None) + values = np.column_stack((concurrencies[:-1], durations)) + + return ( + cls.from_values( + values, + count=count, + include_pdf=include_pdf, + epsilon=epsilon, + ) + if np.any(durations > 0) + else cls.from_values( + [], + count=count, + include_pdf=include_pdf, + epsilon=epsilon, + ) + ) + + @classmethod + def _to_weighted_ndarray( + cls, + inputs: ( + Sequence[float | tuple[float, float] | tuple[float, float, float]] + | np.ndarray + ), + num_values_per_item: Literal[2, 3], + ) -> np.ndarray: + if not isinstance(inputs, np.ndarray): + # Convert list to structured numpy array with dims (N, 
num_dimensions) + # Fill in missing weights with 1.0 + return cls._sequence_to_weighted_ndarray(inputs, num_values_per_item) + + if len(inputs.shape) == 1: + # 1D array: reshape to (N, 1) and add weights column + inputs = inputs.reshape(-1, 1) + weights = np.ones((inputs.shape[0], 1), dtype=float) + + return ( + np.hstack((inputs, weights)) + if num_values_per_item == 2 # noqa: PLR2004 + else np.hstack((inputs, inputs, weights)) + ) + + if len(inputs.shape) == 2 and inputs.shape[1] == num_values_per_item - 1: # noqa: PLR2004 + # Add weights column of 1.0 + weights = np.ones((inputs.shape[0], 1), dtype=float) + + return np.hstack((inputs, weights)) + + if len(inputs.shape) == 2 and inputs.shape[1] == num_values_per_item: # noqa: PLR2004 + return inputs + + raise ValueError( + "inputs must be a numpy array of shape (N,), " + f"(N, {num_values_per_item - 1}), or (N, {num_values_per_item}). " + f"Got shape {inputs.shape}." + ) + + @classmethod + def _sequence_to_weighted_ndarray( + cls, + inputs: Sequence[float | tuple[float, float] | tuple[float, float, float]], + num_values_per_item: Literal[2, 3], + ) -> np.ndarray: + ndarray = np.empty((len(inputs), num_values_per_item), dtype=float) + scalar_types: tuple[type, ...] = (int, float, np.integer, np.floating) + + for ind, val in enumerate(inputs): + if isinstance(val, scalar_types): + ndarray[ind, :] = ( + (val, 1.0) if num_values_per_item == 2 else (val, val, 1.0) # noqa: PLR2004 + ) + elif isinstance(val, tuple) and len(val) == num_values_per_item: + ndarray[ind, :] = val + elif isinstance(val, tuple) and len(val) == num_values_per_item - 1: + ndarray[ind, :] = ( + (val[0], 1.0) if num_values_per_item == 2 else (val[0], val[1], 1.0) # noqa: PLR2004 + ) + else: + raise ValueError( + "Each item must be a float or a tuple of " + f"{num_values_per_item} or {num_values_per_item - 1} " + "elements." 
+ ) + + return ndarray + + @classmethod + def _merge_sorted_times_with_weights( + cls, weighted_times: np.ndarray, threshold: float | None + ) -> np.ndarray: + # First remove any exact duplicate times and sum their weights + unique_times, inverse = np.unique(weighted_times[:, 0], return_inverse=True) + unique_weights = np.zeros_like(unique_times, dtype=float) + np.add.at(unique_weights, inverse, weighted_times[:, 1]) + weighted_times = np.column_stack((unique_times, unique_weights)) + + if threshold is None or threshold <= 0.0: + return weighted_times + + # Loop to merge times within threshold until no more merges possible + # (loop due to possible overlapping merge groups) + while weighted_times.shape[0] > 1: + times = weighted_times[:, 0] + weights = weighted_times[:, 1] + + # Find diffs between consecutive times, create mask for within-threshold + diffs = np.diff(times) + within = diffs <= threshold + if not np.any(within): + break + + # Start indices are marked by the transition from 0 to 1 in the mask + # End indices found by searching for last time within threshold from start + starts = np.where(np.diff(np.insert(within.astype(int), 0, 0)) == 1)[0] + start_end_times = times[starts] + threshold + ends = np.searchsorted(times, start_end_times, side="right") - 1 + + # Collapse overlapping or chained merge groups + if len(starts) > 1: + valid_mask = np.concatenate([[True], starts[1:] > ends[:-1]]) + starts, ends = starts[valid_mask], ends[valid_mask] + + # Update weights at start indices to sum of merged weights + cumsum = np.concatenate(([0.0], np.cumsum(weights))) + weighted_times[starts, 1] = cumsum[ends + 1] - cumsum[starts] + + # Calculate vectorized mask for removing merged entries + merged_events = np.zeros(len(weighted_times) + 1, dtype=int) + np.add.at(merged_events, starts, 1) + np.add.at(merged_events, ends + 1, -1) + remove_mask = np.cumsum(merged_events[:-1]) > 0 + remove_mask[starts] = False # Keep start indices + + # Remove merged entries, update weighted_times + weights = weights[~remove_mask] + times = times[~remove_mask] + weighted_times = np.column_stack((times, weights)) + + return weighted_times + + +class StatusDistributionSummary( + StatusBreakdown[ + DistributionSummary, + DistributionSummary, + DistributionSummary, + DistributionSummary, + ] +): + """ + Distribution summaries broken down by request status categories. + + Provides separate statistical analysis for successful, incomplete, and errored + requests with total aggregate statistics. Enables status-aware performance analysis + and SLO validation across different request outcomes in benchmark results. + """ + + @property + def count(self) -> int: + """ + :return: Total count of samples across all status categories + """ + return self.total.count + + @property + def total_sum(self) -> float: + """ + :return: Total sum of values across all status categories + """ + return self.total.total_sum + + @classmethod + def from_values( + cls, + successful: Sequence[float | tuple[float, float]] | np.ndarray, + incomplete: Sequence[float | tuple[float, float]] | np.ndarray, + errored: Sequence[float | tuple[float, float]] | np.ndarray, + include_pdf: bool | int = False, + epsilon: float = 1e-6, + ) -> StatusDistributionSummary: + """ + Create status-broken-down distribution from values by status category. 
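        For example, per-request latencies (hypothetical values in seconds) can be split by outcome:
        ::
            from guidellm.schemas import StatusDistributionSummary

            latency_summary = StatusDistributionSummary.from_values(
                successful=[0.9, 1.1, 1.0],
                incomplete=[2.5],
                errored=[0.3, 0.4],
            )
            print(latency_summary.successful.mean, latency_summary.total.count)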
+ + :param successful: Values or (value, weight) tuples for successful requests + :param incomplete: Values or (value, weight) tuples for incomplete requests + :param errored: Values or (value, weight) tuples for errored requests + :param include_pdf: Whether to include PDF; True for full, int for sampled size + :param epsilon: Tolerance for probability validation + :return: Status breakdown of distribution summaries + """ + total, successful_arr, incomplete_arr, errored_arr = cls._combine_status_arrays( + successful, incomplete, errored, num_values_per_item=2 + ) + + return StatusDistributionSummary( + total=DistributionSummary.from_values( + total, include_pdf=include_pdf, epsilon=epsilon + ), + successful=DistributionSummary.from_values( + successful_arr, include_pdf=include_pdf, epsilon=epsilon + ), + incomplete=DistributionSummary.from_values( + incomplete_arr, include_pdf=include_pdf, epsilon=epsilon + ), + errored=DistributionSummary.from_values( + errored_arr, include_pdf=include_pdf, epsilon=epsilon + ), + ) + + @classmethod + def from_values_function( + cls, + function: Callable[ + [FunctionObjT], + float | tuple[float, float] | Sequence[float | tuple[float, float]] | None, + ], + successful: Sequence[FunctionObjT], + incomplete: Sequence[FunctionObjT], + errored: Sequence[FunctionObjT], + include_pdf: bool | int = False, + epsilon: float = 1e-6, + ) -> StatusDistributionSummary: + """ + Create distribution summary by extracting values from objects via function. + + :param function: Function to extract value(s) from each object + :param successful: Successful request objects + :param incomplete: Incomplete request objects + :param errored: Errored request objects + :param include_pdf: Whether to include PDF; True for full, int for sampled size + :param epsilon: Tolerance for probability validation + :return: Status breakdown of distribution summaries + """ + + def _extract_values( + _objs: Sequence[FunctionObjT], + ) -> Sequence[float | tuple[float, float]]: + _outputs: list[float | tuple[float, float]] = [] + for _obj in _objs: + if (_result := function(_obj)) is None: + continue + if isinstance(_result, Sequence) and not isinstance(_result, tuple): + _outputs.extend(_result) + else: + _outputs.append(_result) + return _outputs + + return cls.from_values( + successful=_extract_values(successful), + incomplete=_extract_values(incomplete), + errored=_extract_values(errored), + include_pdf=include_pdf, + epsilon=epsilon, + ) + + @classmethod + def rate_distribution_from_timings( + cls, + successful: Sequence[float | tuple[float, float]] | np.ndarray, + incomplete: Sequence[float | tuple[float, float]] | np.ndarray, + errored: Sequence[float | tuple[float, float]] | np.ndarray, + start_time: float | None = None, + end_time: float | None = None, + threshold: float | None = 1e-4, + include_pdf: bool | int = False, + epsilon: float = 1e-6, + ) -> StatusDistributionSummary: + """ + Create status-broken-down rate distribution from event timestamps. 
+ + :param successful: Timestamps for successful request events + :param incomplete: Timestamps for incomplete request events + :param errored: Timestamps for errored request events + :param start_time: Analysis window start + :param end_time: Analysis window end + :param threshold: Time threshold for merging nearby events + :param include_pdf: Whether to include PDF; True for full, int for sampled size + :param epsilon: Tolerance for probability validation + :return: Status breakdown of rate distribution summaries + """ + total, successful_arr, incomplete_arr, errored_arr = cls._combine_status_arrays( + successful, incomplete, errored, num_values_per_item=2 + ) + + return StatusDistributionSummary( + total=DistributionSummary.rate_distribution_from_timings( + total, + start_time=start_time, + end_time=end_time, + threshold=threshold, + include_pdf=include_pdf, + epsilon=epsilon, + ), + successful=DistributionSummary.rate_distribution_from_timings( + successful_arr, + start_time=start_time, + end_time=end_time, + threshold=threshold, + include_pdf=include_pdf, + epsilon=epsilon, + ), + incomplete=DistributionSummary.rate_distribution_from_timings( + incomplete_arr, + start_time=start_time, + end_time=end_time, + threshold=threshold, + include_pdf=include_pdf, + epsilon=epsilon, + ), + errored=DistributionSummary.rate_distribution_from_timings( + errored_arr, + start_time=start_time, + end_time=end_time, + threshold=threshold, + include_pdf=include_pdf, + epsilon=epsilon, + ), + ) + + @classmethod + def rate_distribution_from_timings_function( + cls, + function: Callable[ + [FunctionObjT], + float | tuple[float, float] | Sequence[float | tuple[float, float]] | None, + ], + successful: Sequence[FunctionObjT], + incomplete: Sequence[FunctionObjT], + errored: Sequence[FunctionObjT], + start_time: float | None = None, + end_time: float | None = None, + threshold: float | None = 1e-4, + include_pdf: bool | int = False, + epsilon: float = 1e-6, + ) -> StatusDistributionSummary: + """ + Create rate distribution by extracting timestamps from objects via function. 
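        For example, with hypothetical request records stored as dicts, the extractor returns the completion timestamp and returning None skips a record:
        ::
            completed = [{"finished_at": 100.2}, {"finished_at": 100.9}, {"finished_at": None}]
            rate = StatusDistributionSummary.rate_distribution_from_timings_function(
                function=lambda record: record["finished_at"],
                successful=completed,
                incomplete=[],
                errored=[],
            )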
+ + :param function: Function to extract timestamp(s) from each object + :param successful: Successful request objects + :param incomplete: Incomplete request objects + :param errored: Errored request objects + :param start_time: Analysis window start + :param end_time: Analysis window end + :param threshold: Time threshold for merging nearby events + :param include_pdf: Whether to include PDF; True for full, int for sampled size + :param epsilon: Tolerance for probability validation + :return: Status breakdown of rate distribution summaries + """ + + def _extract_values( + _objs: Sequence[FunctionObjT], + ) -> Sequence[float | tuple[float, float]]: + _outputs: list[float | tuple[float, float]] = [] + for _obj in _objs: + if (_result := function(_obj)) is None: + continue + if isinstance(_result, Sequence) and not isinstance(_result, tuple): + _outputs.extend(_result) + else: + _outputs.append(_result) + return _outputs + + return cls.rate_distribution_from_timings( + successful=_extract_values(successful), + incomplete=_extract_values(incomplete), + errored=_extract_values(errored), + start_time=start_time, + end_time=end_time, + threshold=threshold, + include_pdf=include_pdf, + epsilon=epsilon, + ) + + @classmethod + def concurrency_distribution_from_timings( + cls, + successful: Sequence[tuple[float, float] | tuple[float, float, float]] + | np.ndarray, + incomplete: Sequence[tuple[float, float] | tuple[float, float, float]] + | np.ndarray, + errored: Sequence[tuple[float, float] | tuple[float, float, float]] + | np.ndarray, + start_time: float | None = None, + end_time: float | None = None, + threshold: float | None = 1e-4, + include_pdf: bool | int = False, + epsilon: float = 1e-6, + ) -> StatusDistributionSummary: + """ + Create status-broken-down concurrency distribution from event intervals. 
+ + :param successful: Event intervals for successful requests + :param incomplete: Event intervals for incomplete requests + :param errored: Event intervals for errored requests + :param start_time: Analysis window start + :param end_time: Analysis window end + :param threshold: Time threshold for merging nearby transitions + :param include_pdf: Whether to include PDF; True for full, int for sampled size + :param epsilon: Tolerance for probability validation + :return: Status breakdown of concurrency distribution summaries + """ + total, successful_arr, incomplete_arr, errored_arr = cls._combine_status_arrays( + successful, incomplete, errored, num_values_per_item=3 + ) + + return StatusDistributionSummary( + total=DistributionSummary.concurrency_distribution_from_timings( + total, + start_time=start_time, + end_time=end_time, + threshold=threshold, + include_pdf=include_pdf, + epsilon=epsilon, + ), + successful=DistributionSummary.concurrency_distribution_from_timings( + successful_arr, + start_time=start_time, + end_time=end_time, + threshold=threshold, + include_pdf=include_pdf, + epsilon=epsilon, + ), + incomplete=DistributionSummary.concurrency_distribution_from_timings( + incomplete_arr, + start_time=start_time, + end_time=end_time, + threshold=threshold, + include_pdf=include_pdf, + epsilon=epsilon, + ), + errored=DistributionSummary.concurrency_distribution_from_timings( + errored_arr, + start_time=start_time, + end_time=end_time, + threshold=threshold, + include_pdf=include_pdf, + epsilon=epsilon, + ), + ) + + @classmethod + def concurrency_distribution_from_timings_function( + cls, + function: Callable[ + [FunctionObjT], + tuple[float, float] + | tuple[float, float, float] + | Sequence[tuple[float, float] | tuple[float, float, float]] + | None, + ], + successful: Sequence[FunctionObjT], + incomplete: Sequence[FunctionObjT], + errored: Sequence[FunctionObjT], + start_time: float | None = None, + end_time: float | None = None, + threshold: float | None = 1e-4, + include_pdf: bool | int = False, + epsilon: float = 1e-6, + ) -> StatusDistributionSummary: + """ + Create concurrency distribution by extracting intervals from objects. 
+ + :param function: Function to extract time interval(s) from each object + :param successful: Successful request objects + :param incomplete: Incomplete request objects + :param errored: Errored request objects + :param start_time: Analysis window start + :param end_time: Analysis window end + :param threshold: Time threshold for merging nearby transitions + :param include_pdf: Whether to include PDF; True for full, int for sampled size + :param epsilon: Tolerance for probability validation + :return: Status breakdown of concurrency distribution summaries + """ + + def _extract_values( + _objs: Sequence[FunctionObjT], + ) -> Sequence[tuple[float, float] | tuple[float, float, float]]: + _outputs: list[tuple[float, float] | tuple[float, float, float]] = [] + for _obj in _objs: + if (_result := function(_obj)) is None: + continue + if isinstance(_result, Sequence) and not isinstance(_result, tuple): + _outputs.extend(_result) + else: + _outputs.append(_result) + return _outputs + + return cls.concurrency_distribution_from_timings( + successful=_extract_values(successful), + incomplete=_extract_values(incomplete), + errored=_extract_values(errored), + start_time=start_time, + end_time=end_time, + threshold=threshold, + include_pdf=include_pdf, + epsilon=epsilon, + ) + + @classmethod + def _combine_status_arrays( + cls, + successful: Sequence[float | tuple[float, float] | tuple[float, float, float]] + | np.ndarray, + incomplete: Sequence[float | tuple[float, float] | tuple[float, float, float]] + | np.ndarray, + errored: Sequence[float | tuple[float, float] | tuple[float, float, float]] + | np.ndarray, + num_values_per_item: Literal[2, 3], + ) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: + successful_array = DistributionSummary._to_weighted_ndarray( # noqa: SLF001 + successful, num_values_per_item=num_values_per_item + ) + incomplete_array = DistributionSummary._to_weighted_ndarray( # noqa: SLF001 + incomplete, num_values_per_item=num_values_per_item + ) + errored_array = DistributionSummary._to_weighted_ndarray( # noqa: SLF001 + errored, num_values_per_item=num_values_per_item + ) + total_array = np.concatenate( + (successful_array, incomplete_array, errored_array), axis=0 + ) + return total_array, successful_array, incomplete_array, errored_array diff --git a/src/guidellm/schemas/stats.py b/src/guidellm/schemas/stats.py deleted file mode 100644 index 67f1d26c..00000000 --- a/src/guidellm/schemas/stats.py +++ /dev/null @@ -1,228 +0,0 @@ -""" -Request statistics and metrics for generative AI benchmark analysis. - -Provides data structures for capturing and analyzing performance metrics from -generative AI workloads. Contains request-level statistics including token counts, -latency measurements, and throughput calculations for text generation benchmarks. -""" - -from __future__ import annotations - -from typing import Literal - -from pydantic import Field, computed_field - -from guidellm.schemas.info import RequestInfo -from guidellm.schemas.request import GenerativeRequestType, UsageMetrics -from guidellm.utils import StandardBaseDict - -__all__ = ["GenerativeRequestStats"] - - -class GenerativeRequestStats(StandardBaseDict): - """ - Request statistics for generative AI text generation workloads. - - Captures comprehensive performance metrics for individual generative requests, - including token counts, timing measurements, and derived performance statistics. 
- Provides computed properties for latency analysis, throughput calculations, - and token generation metrics essential for benchmark evaluation. - - Example: - :: - stats = GenerativeRequestStats( - request_id="req_123", - request_type="text_completion", - info=request_info, - input_metrics=input_usage, - output_metrics=output_usage - ) - throughput = stats.output_tokens_per_second - """ - - type_: Literal["generative_request_stats"] = "generative_request_stats" - request_id: str = Field(description="Unique identifier for the request") - request_type: GenerativeRequestType | str = Field( - description="Type of generative request: text or chat completion" - ) - request_args: str | None = Field( - default=None, description="Arguments passed to the backend for this request" - ) - output: str | None = Field( - description="Generated text output, if request completed successfully" - ) - info: RequestInfo = Field( - description="Metadata and timing information for the request" - ) - input_metrics: UsageMetrics = Field( - description="Usage statistics for the input prompt" - ) - output_metrics: UsageMetrics = Field( - description="Usage statistics for the generated output" - ) - - # Request stats - @computed_field # type: ignore[misc] - @property - def request_latency(self) -> float | None: - """ - End-to-end request processing latency in seconds. - - :return: Duration from request start to completion, or None if unavailable. - """ - if not self.info.timings.request_end or not self.info.timings.request_start: - return None - - return self.info.timings.request_end - self.info.timings.request_start - - # General token stats - @computed_field # type: ignore[misc] - @property - def prompt_tokens(self) -> int | None: - """ - Number of tokens in the input prompt. - - :return: Input prompt token count, or None if unavailable. - """ - return self.input_metrics.text_tokens - - @computed_field # type: ignore[misc] - @property - def input_tokens(self) -> int | None: - """ - Number of tokens in the input prompt. - - :return: Input prompt token count, or None if unavailable. - """ - return self.input_metrics.total_tokens - - @computed_field # type: ignore[misc] - @property - def output_tokens(self) -> int | None: - """ - Number of tokens in the generated output. - - :return: Generated output token count, or None if unavailable. - """ - return self.output_metrics.total_tokens - - @computed_field # type: ignore[misc] - @property - def total_tokens(self) -> int | None: - """ - Total token count including prompt and output tokens. - - :return: Sum of prompt and output tokens, or None if either is unavailable. - """ - input_tokens = self.input_metrics.total_tokens - output_tokens = self.output_metrics.total_tokens - - if input_tokens is None and output_tokens is None: - return None - - return (input_tokens or 0) + (output_tokens or 0) - - @computed_field # type: ignore[misc] - @property - def time_to_first_token_ms(self) -> float | None: - """ - Time to first token generation in milliseconds. - - :return: Latency from request start to first token, or None if unavailable. - """ - if ( - not self.info.timings.first_iteration - or not self.info.timings.request_start - or self.info.timings.first_iteration == self.info.timings.last_iteration - ): - return None - - return 1000 * ( - self.info.timings.first_iteration - self.info.timings.request_start - ) - - @computed_field # type: ignore[misc] - @property - def time_per_output_token_ms(self) -> float | None: - """ - Average time per output token in milliseconds. 
- - Includes time for first token and all subsequent tokens. - - :return: Average milliseconds per output token, or None if unavailable. - """ - if ( - not self.info.timings.request_start - or not self.info.timings.last_iteration - or not self.output_metrics.total_tokens - ): - return None - - return ( - 1000 - * (self.info.timings.last_iteration - self.info.timings.request_start) - / self.output_metrics.total_tokens - ) - - @computed_field # type: ignore[misc] - @property - def inter_token_latency_ms(self) -> float | None: - """ - Average inter-token latency in milliseconds. - - Measures time between token generations, excluding first token. - - :return: Average milliseconds between tokens, or None if unavailable. - """ - if ( - not self.info.timings.first_iteration - or not self.info.timings.last_iteration - or not self.output_metrics.total_tokens - or self.output_metrics.total_tokens <= 1 - ): - return None - - return ( - 1000 - * (self.info.timings.last_iteration - self.info.timings.first_iteration) - / (self.output_metrics.total_tokens - 1) - ) - - @computed_field # type: ignore[misc] - @property - def tokens_per_second(self) -> float | None: - """ - Overall token throughput including prompt and output tokens. - - :return: Total tokens per second, or None if unavailable. - """ - if not (latency := self.request_latency) or self.total_tokens is None: - return None - - return self.total_tokens / latency - - @computed_field # type: ignore[misc] - @property - def output_tokens_per_second(self) -> float | None: - """ - Output token generation throughput. - - :return: Output tokens per second, or None if unavailable. - """ - if not (latency := self.request_latency) or self.output_tokens is None: - return None - - return self.output_tokens / latency - - @computed_field # type: ignore[misc] - @property - def output_tokens_per_iteration(self) -> float | None: - """ - Average output tokens generated per iteration. - - :return: Output tokens per iteration, or None if unavailable. 
- """ - if self.output_tokens is None or not self.info.timings.iterations: - return None - - return self.output_tokens / self.info.timings.iterations diff --git a/src/guidellm/settings.py b/src/guidellm/settings.py index f03b19e2..12c8ef30 100644 --- a/src/guidellm/settings.py +++ b/src/guidellm/settings.py @@ -162,7 +162,7 @@ class Settings(BaseSettings): preferred_output_tokens_source: Literal["request", "response"] = "response" preferred_backend: Literal["openai"] = "openai" preferred_route: Literal["text_completions", "chat_completions"] = ( - "text_completions" + "chat_completions" ) openai: OpenAISettings = OpenAISettings() diff --git a/src/guidellm/utils/__init__.py b/src/guidellm/utils/__init__.py index 89312771..0874c291 100644 --- a/src/guidellm/utils/__init__.py +++ b/src/guidellm/utils/__init__.py @@ -13,6 +13,7 @@ all_defined, safe_add, safe_divide, + safe_format_number, safe_format_timestamp, safe_getattr, safe_multiply, @@ -28,23 +29,9 @@ SendMessageT, ) from .mixins import InfoMixin -from .pydantic_utils import ( - PydanticClassRegistryMixin, - ReloadableBaseModel, - StandardBaseDict, - StandardBaseModel, - StatusBreakdown, -) from .random import IntegerRangeSampler from .registry import RegistryMixin, RegistryObjT from .singleton import SingletonMixin, ThreadSafeSingletonMixin -from .statistics import ( - DistributionSummary, - Percentiles, - RunningStats, - StatusDistributionSummary, - TimeRunningStats, -) from .synchronous import ( wait_for_sync_barrier, wait_for_sync_event, @@ -67,11 +54,9 @@ "SUPPORTED_TYPES", "AutoImporterMixin", "Colors", - "Colors", "Console", "ConsoleUpdateStep", "DefaultGroupHandler", - "DistributionSummary", "Encoder", "EncodingTypesAlias", "EndlessTextCreator", @@ -82,25 +67,15 @@ "InterProcessMessagingPipe", "InterProcessMessagingQueue", "MessageEncoding", - "MessageEncoding", - "Percentiles", - "PydanticClassRegistryMixin", "RegistryMixin", "RegistryObjT", - "ReloadableBaseModel", - "RunningStats", "SendMessageT", "SerializationTypesAlias", "Serializer", "SingletonMixin", - "StandardBaseDict", - "StandardBaseModel", - "StatusBreakdown", - "StatusDistributionSummary", "StatusIcons", "StatusStyles", "ThreadSafeSingletonMixin", - "TimeRunningStats", "all_defined", "camelize_str", "check_load_processor", @@ -114,6 +89,7 @@ "recursive_key_update", "safe_add", "safe_divide", + "safe_format_number", "safe_format_timestamp", "safe_getattr", "safe_multiply", diff --git a/src/guidellm/utils/console.py b/src/guidellm/utils/console.py index 54e90cf7..bdb2da86 100644 --- a/src/guidellm/utils/console.py +++ b/src/guidellm/utils/console.py @@ -1,8 +1,18 @@ +""" +Console utilities for rich terminal output and status updates. + +Provides an extended Rich console with custom formatting for status messages, +progress tracking, and tabular data display. Includes predefined color schemes, +status levels, icons, and styles for consistent terminal output across the +application. Supports multi-step operations with spinners and context managers +for clean progress reporting. 
+""" + from __future__ import annotations -from collections.abc import Mapping +from collections.abc import Mapping, Sequence from dataclasses import dataclass -from typing import Any, Literal +from typing import Annotated, Any, Literal from rich.console import Console as RichConsole from rich.padding import Padding @@ -14,11 +24,41 @@ "Console", "ConsoleUpdateStep", "StatusIcons", + "StatusLevel", "StatusStyles", ] +StatusLevel = Annotated[ + Literal[ + "debug", + "info", + "warning", + "error", + "critical", + "notset", + "success", + ], + "Status level for console messages indicating severity or state", +] + class Colors: + """ + Color constants for console styling. + + Provides standardized color schemes for different message types and branding. + Colors are defined using Rich console color names or hex values. + + :cvar info: Color for informational messages + :cvar progress: Color for progress indicators + :cvar success: Color for successful operations + :cvar warning: Color for warning messages + :cvar error: Color for error messages + :cvar primary: Primary brand color + :cvar secondary: Secondary brand color + :cvar tertiary: Tertiary brand color + """ + # Core states info: str = "light_steel_blue" progress: str = "dark_slate_gray1" @@ -32,7 +72,10 @@ class Colors: tertiary: str = "#008080" -StatusIcons: Mapping[str, str] = { +StatusIcons: Annotated[ + Mapping[str, str], + "Mapping of status levels to unicode icon characters for visual indicators", +] = { "debug": "…", "info": "ℹ", "warning": "⚠", @@ -42,7 +85,10 @@ class Colors: "success": "✔", } -StatusStyles: Mapping[str, str] = { +StatusStyles: Annotated[ + Mapping[str, str], + "Mapping of status levels to Rich console style strings for colored output", +] = { "debug": "dim", "info": f"bold {Colors.info}", "warning": f"bold {Colors.warning}", @@ -55,95 +101,119 @@ class Colors: @dataclass class ConsoleUpdateStep: + """ + Context manager for multi-step progress operations with spinner. + + Displays animated spinner during operation execution and allows dynamic + status updates. Automatically stops spinner on exit and prints final + status message. Designed for use with Python's `with` statement. + + Example: + :: + console = Console() + with console.print_update_step("Processing data") as step: + step.update("Loading files", "info") + # ... do work ... 
+ step.finish("Completed successfully", status_level="success") + + :param console: The Console instance to use for output + :param title: Initial progress message to display + :param details: Optional additional details to show after completion + :param status_level: Initial status level determining style and icon + :param spinner: Spinner animation style name from Rich's spinner set + """ + console: Console title: str details: Any | None = None - status_level: Literal[ - "debug", - "info", - "warning", - "error", - "critical", - "notset", - "success", - ] = "info" + status_level: StatusLevel = "info" spinner: str = "dots" _status: Status | None = None - def __enter__(self): + def __enter__(self) -> ConsoleUpdateStep: if self.console.quiet: return self + style = StatusStyles.get(self.status_level, "bold") self._status = self.console.status( - f"[{StatusStyles.get(self.status_level, 'bold')}]{self.title}[/]", + f"[{style}]{self.title}[/]", spinner=self.spinner, ) self._status.__enter__() return self - def update( - self, - title: str, - status_level: Literal[ - "debug", - "info", - "warning", - "error", - "critical", - "notset", - "success", - ] - | None = None, - ): + def update(self, title: str, status_level: StatusLevel | None = None): + """ + Update the progress message and optionally the status level. + + :param title: New progress message to display + :param status_level: Optional new status level to apply + """ self.title = title if status_level is not None: self.status_level = status_level + if self._status: - self._status.update( - status=f"[{StatusStyles.get(self.status_level, 'bold')}]{title}[/]" - ) + style = StatusStyles.get(self.status_level, "bold") + self._status.update(status=f"[{style}]{title}[/]") def finish( self, title: str, details: Any | None = None, - status_level: Literal[ - "debug", - "info", - "warning", - "error", - "critical", - "notset", - "success", - ] = "info", + status_level: StatusLevel = "info", ): + """ + Stop the spinner and print the final status message. + + :param title: Final completion message to display + :param details: Optional additional information to show below message + :param status_level: Status level for final message styling + """ self.title = title self.status_level = status_level + if self._status: self._status.stop() + self.console.print_update(title, details, status_level) def __exit__(self, exc_type, exc_val, exc_tb): if self._status: - return self._status.__exit__(exc_type, exc_val, exc_tb) - return False + self._status.__exit__(exc_type, exc_val, exc_tb) class Console(RichConsole): + """ + Extended Rich console with custom formatting and status reporting. + + Enhances Rich's Console with specialized methods for status messages, + progress tracking with spinners, and formatted table output. Provides + consistent styling through predefined status levels, icons, and colors. + Supports quiet mode to suppress non-critical output. + + Example: + :: + console = Console() + console.print_update("Starting process", status="info") + with console.print_update_step("Loading data") as step: + step.update("Processing items") + step.finish("Complete", status_level="success") + """ + def print_update( self, title: str, - details: str | None = None, - status: Literal[ - "debug", - "info", - "warning", - "error", - "critical", - "notset", - "success", - ] = "info", - ) -> None: + details: Any | None = None, + status: StatusLevel = "info", + ): + """ + Print a status message with icon and optional details. 
+ + :param title: Main status message to display + :param details: Optional additional details shown indented below message + :param status: Status level determining icon and styling + """ icon = StatusIcons.get(status, "•") style = StatusStyles.get(status, "bold") line = Text.assemble(f"{icon} ", (title, style)) @@ -151,6 +221,11 @@ def print_update( self.print_update_details(details) def print_update_details(self, details: Any | None): + """ + Print additional details indented below a status message. + + :param details: Content to display, converted to string and styled dimly + """ if details: block = Padding( Text.from_markup(str(details)), @@ -162,18 +237,19 @@ def print_update_details(self, details: Any | None): def print_update_step( self, title: str, - status: Literal[ - "debug", - "info", - "warning", - "error", - "critical", - "notset", - "success", - ] = "info", + status: StatusLevel = "info", details: Any | None = None, spinner: str = "dots", ) -> ConsoleUpdateStep: + """ + Create a context manager for multi-step progress with spinner. + + :param title: Initial progress message to display + :param status: Initial status level for styling + :param details: Optional details to show after completion + :param spinner: Spinner animation style name + :return: ConsoleUpdateStep context manager for progress tracking + """ return ConsoleUpdateStep( console=self, title=title, @@ -181,3 +257,310 @@ def print_update_step( status_level=status, spinner=spinner, ) + + def print_tables( + self, + header_cols_groups: Sequence[Sequence[str | list[str]]], + value_cols_groups: Sequence[Sequence[str | list[str]]], + title: str | None = None, + widths: Sequence[int] | None = None, + ): + """ + Print multiple tables with uniform column widths. + + :param header_cols_groups: List of header column groups for each table + :param value_cols_groups: List of value column groups for each table + :param title: Optional title to display before tables + :param widths: Optional minimum column widths to enforce + """ + if title is not None: + self.print_update(title, None, "info") + + # Format all groups to determine uniform widths + widths = widths or None + headers = [] + values = [] + + # Process all tables to get consistent widths + for value_cols in value_cols_groups: + formatted, widths = self._format_table_columns(value_cols, widths) + values.append(formatted) + for header_cols in header_cols_groups: + formatted, widths = self._format_table_headers(header_cols, widths) + headers.append(formatted) + + # Print each table + for ind, (header, value) in enumerate(zip(headers, values, strict=False)): + is_last = ind == len(headers) - 1 + self.print_table( + header, + value, + widths=widths, + apply_formatting=False, + print_bottom_divider=is_last, + ) + + def print_table( + self, + header_cols: Sequence[str | list[str]], + value_cols: Sequence[str | list[str]], + title: str | None = None, + widths: Sequence[int] | None = None, + apply_formatting: bool = True, + print_bottom_divider: bool = True, + ): + """ + Print a formatted table with headers and values. 
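A minimal usage sketch of this column-oriented API, assuming the same Console instance: each entry in header_cols and value_cols is one column, given either as a single string or as a list of per-row strings, and identical adjacent header cells are merged into a single spanning header::

    console.print_table(
        header_cols=[
            ["", "Benchmark"],
            ["Latency (ms)", "Mean"],  # identical adjacent top cells are merged
            ["Latency (ms)", "P95"],   # into one header spanning both columns
        ],
        value_cols=[
            ["baseline", "tuned"],
            ["12.3", "10.1"],
            ["45.6", "38.2"],
        ],
        title="Benchmark summary",
    )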
+ + :param header_cols: List of header columns, each string or list of strings + :param value_cols: List of value columns, each string or list of strings + :param title: Optional title to display before table + :param widths: Optional minimum column widths to enforce + :param apply_formatting: Whether to calculate widths and format columns + :param print_bottom_divider: Whether to print bottom border line + """ + if title is not None: + self.print_update(title, None, "info") + + # Format data + values: list[list[str]] + headers: list[list[str]] + final_widths: list[int] + + if apply_formatting: + values, final_widths = self._format_table_columns(value_cols, widths) + headers, final_widths = self._format_table_headers( + header_cols, final_widths + ) + else: + values = [col if isinstance(col, list) else [col] for col in value_cols] + headers = [col if isinstance(col, list) else [col] for col in header_cols] + final_widths = list(widths) if widths else [] + + # Print table structure + self.print_table_divider(final_widths, "=") + self.print_table_headers(headers, final_widths) + self.print_table_divider(final_widths, "-") + self.print_table_values(values, final_widths) + + if print_bottom_divider: + self.print_table_divider(final_widths, "=") + + def print_table_divider(self, widths: Sequence[int], char: str): + """ + Print a horizontal divider line across table columns. + + :param widths: Column widths for divider line + :param char: Character to use for divider line (e.g., '=', '-') + """ + self.print_table_row( + [""] * len(widths), + widths=widths, + spacer=char, + cell_style="bold", + divider_style="bold", + edge_style="bold", + ) + + def print_table_headers(self, headers: Sequence[list[str]], widths: Sequence[int]): + """ + Print header rows with support for column spanning. + + :param headers: List of header columns, each containing header row values + :param widths: Column widths for proper alignment + """ + if not headers or not headers[0]: + return + + for row_idx in range(len(headers[0])): + # Calculate widths for this header row, accounting for merged cells. + row_widths = list(widths) + for col_idx in range(len(headers)): + if not headers[col_idx][row_idx]: + continue + + # Find span end + span_end = col_idx + 1 + while span_end < len(headers) and not headers[span_end][row_idx]: + row_widths[span_end] = 0 + span_end += 1 + + # Set combined width for the first cell in span + row_widths[col_idx] = sum( + widths[col] for col in range(col_idx, span_end) + ) + + # Print the header row + self.print_table_row( + values=[headers[col][row_idx] for col in range(len(headers))], + widths=row_widths, + cell_style="bold", + divider_style="bold", + edge_style="bold", + ) + + def print_table_values(self, values: Sequence[list[str]], widths: Sequence[int]): + """ + Print all data rows in the table. + + :param values: List of value columns, each containing row values + :param widths: Column widths for proper alignment + """ + if not values: + return + + for row_idx in range(len(values[0])): + # Print the value row + self.print_table_row( + values=[values[col][row_idx] for col in range(len(values))], + widths=widths, + divider="|", + edge_style="bold", + ) + + def print_table_row( + self, + values: Sequence[str], + widths: Sequence[int] | None = None, + spacer: str = " ", + divider: str = "|", + cell_style: str = "", + value_style: str = "", + divider_style: str = "", + edge_style: str = "", + ): + """ + Print a single table row with custom styling. 
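A rough sketch of what a single row resolves to before Rich renders the markup, again assuming the same Console instance::

    console.print_table_row(
        ["alpha", "beta"],
        widths=[8, 8],
        cell_style="bold",
    )
    # builds and prints the line:
    # |[bold]alpha   [/bold]|[bold]beta    [/bold]|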
+ + :param values: Cell values for the row + :param widths: Column widths, defaults to value lengths + :param spacer: Character for padding cells + :param divider: Character separating columns + :param cell_style: Rich style string for entire cells + :param value_style: Rich style string for cell values only + :param divider_style: Rich style string for column dividers + :param edge_style: Rich style string for table edges + """ + widths = widths or [len(val) for val in values] + + # Build styled cells + cells = [] + for val, width in zip(values, widths, strict=True): + cell = val.ljust(width, spacer) + if value_style and val: + cell = cell.replace(val, f"[{value_style}]{val}[/{value_style}]") + if cell_style: + cell = f"[{cell_style}]{cell}[/{cell_style}]" + cells.append(cell) + + # Build and print row + edge = f"[{edge_style}]{divider}[/{edge_style}]" if edge_style else divider + inner = ( + f"[{divider_style}]{divider}[/{divider_style}]" + if divider_style + else divider + ) + line = edge + inner.join(cells) + edge + self.print(line, overflow="ignore", crop=False) + + def _format_table_headers( + self, + headers: Sequence[str | list[str]], + col_widths: Sequence[int] | None = None, + spacer: str = " ", + min_padding: int = 1, + ) -> tuple[list[list[str]], list[int]]: + formatted, header_widths = self._format_table_columns( + headers, col_widths, spacer, min_padding + ) + + if not formatted or not formatted[0]: + return formatted, [] + + # Merge identical adjacent headers row by row + widths = list(col_widths) if col_widths else header_widths + for row_idx in range(len(formatted[0])): + last_value = None + start_col = -1 + + for col_idx in range(len(formatted) + 1): + cur_value = ( + formatted[col_idx][row_idx] if col_idx < len(formatted) else None + ) + + # Check if we should continue merging + if ( + col_idx < len(formatted) + and cur_value != "" + and cur_value == last_value + and ( + row_idx == 0 + or headers[start_col][row_idx - 1] + == headers[col_idx][row_idx - 1] + ) + ): + continue + + # Finalize previous + if start_col >= 0: + # Clear merged cells to keep only the first + for col in range(start_col + 1, col_idx): + formatted[col][row_idx] = "" + + # Adjust widths of columns in the merged span, if needed + if (required := len(formatted[start_col][row_idx])) > ( + current := sum(widths[col] for col in range(start_col, col_idx)) + ): + diff = required - current + cols_count = col_idx - start_col + per_col = diff // cols_count + extra = diff % cols_count + + for col in range(start_col, col_idx): + widths[col] += per_col + if extra > 0: + widths[col] += 1 + extra -= 1 + + # Start new merge + last_value = cur_value + start_col = col_idx + + return formatted, widths + + def _format_table_columns( + self, + columns: Sequence[str | list[str]], + col_widths: Sequence[int] | None = None, + spacer: str = " ", + min_padding: int = 1, + ) -> tuple[list[list[str]], list[int]]: + if not columns: + return [], [] + + # Normalize to list of lists + max_rows = max(len(col) if isinstance(col, list) else 1 for col in columns) + + formatted = [] + for col in columns: + col_list = col if isinstance(col, list) else [col] + # Pad to max height + col_list = col_list + [""] * (max_rows - len(col_list)) + # Add cell padding + padding = spacer * min_padding + col_list = [ + f"{padding}{item}{padding}" if item else "" for item in col_list + ] + formatted.append(col_list) + + # Calculate widths + widths = [max(len(row) for row in col) for col in formatted] + + # Apply minimum widths if provided + if 
col_widths is not None: + widths = [ + max(width, min_w) + for width, min_w in zip(widths, col_widths, strict=True) + ] + + return formatted, widths diff --git a/src/guidellm/utils/functions.py b/src/guidellm/utils/functions.py index ed4a2075..0633616c 100644 --- a/src/guidellm/utils/functions.py +++ b/src/guidellm/utils/functions.py @@ -15,6 +15,7 @@ "all_defined", "safe_add", "safe_divide", + "safe_format_number", "safe_format_timestamp", "safe_getattr", "safe_multiply", @@ -115,7 +116,7 @@ def safe_add( def safe_format_timestamp( - timestamp: float | None, format_: str = "%H:%M:%S", default: str = "N/A" + timestamp: float | int | None, format_: str = "%H:%M:%S", default: str = "N/A" ) -> str: """ Safely format a timestamp with error handling and validation. @@ -132,3 +133,27 @@ def safe_format_timestamp( return datetime.fromtimestamp(timestamp).strftime(format_) except (ValueError, OverflowError, OSError): return default + + +def safe_format_number( + number: int | float | None, precision: int = 1, default: str = "--" +) -> str: + """ + Safely format a number with specified precision and default handling. + + :param number: Number to format, or None + :param precision: Number of decimal places for formatting floats + :param default: Value to return if number is None + :return: Formatted number string or default value + """ + if number is None: + return default + + if isinstance(number, int): + return str(number) + + try: + format_str = f"{{:.{precision}f}}" + return format_str.format(number) + except (ValueError, TypeError): + return default diff --git a/src/guidellm/utils/statistics.py b/src/guidellm/utils/statistics.py deleted file mode 100644 index a8403c72..00000000 --- a/src/guidellm/utils/statistics.py +++ /dev/null @@ -1,1047 +0,0 @@ -""" -Statistical analysis utilities for distribution calculations and running metrics. - -Provides comprehensive statistical computation tools for analyzing numerical -distributions, percentiles, and streaming data. Includes specialized support for -request timing analysis, concurrency measurement, and rate calculations. Integrates -with Pydantic for serializable statistical models and supports both weighted and -unweighted distributions with cumulative distribution function (CDF) generation. -""" - -from __future__ import annotations - -import math -import time as timer -from collections import defaultdict -from typing import Any, Literal - -import numpy as np -from pydantic import Field, computed_field - -from guidellm.utils.pydantic_utils import StandardBaseModel, StatusBreakdown - -__all__ = [ - "DistributionSummary", - "Percentiles", - "RunningStats", - "StatusDistributionSummary", - "TimeRunningStats", -] - - -class Percentiles(StandardBaseModel): - """ - Standard percentiles model for statistical distribution analysis. - - Provides complete percentile coverage from 0.1th to 99.9th percentiles for - statistical distribution characterization. Used as a component within - DistributionSummary to provide detailed distribution shape analysis. 
- """ - - p001: float = Field( - description="The 0.1th percentile of the distribution.", - ) - p01: float = Field( - description="The 1st percentile of the distribution.", - ) - p05: float = Field( - description="The 5th percentile of the distribution.", - ) - p10: float = Field( - description="The 10th percentile of the distribution.", - ) - p25: float = Field( - description="The 25th percentile of the distribution.", - ) - p50: float = Field( - description="The 50th percentile of the distribution.", - ) - p75: float = Field( - description="The 75th percentile of the distribution.", - ) - p90: float = Field( - description="The 90th percentile of the distribution.", - ) - p95: float = Field( - description="The 95th percentile of the distribution.", - ) - p99: float = Field( - description="The 99th percentile of the distribution.", - ) - p999: float = Field( - description="The 99.9th percentile of the distribution.", - ) - - -class DistributionSummary(StandardBaseModel): - """ - Comprehensive statistical summary for numerical value distributions. - - Calculates and stores complete statistical metrics including central tendency, - dispersion, extremes, and percentiles for any numerical distribution. Supports - both weighted and unweighted data with optional cumulative distribution function - generation. Primary statistical analysis tool for request timing, performance - metrics, and benchmark result characterization. - - Example: - :: - # Create from simple values - summary = DistributionSummary.from_values([1.0, 2.0, 3.0, 4.0, 5.0]) - print(f"Mean: {summary.mean}, P95: {summary.percentiles.p95}") - - # Create from request timings for concurrency analysis - requests = [(0.0, 1.0), (0.5, 2.0), (1.0, 2.5)] - concurrency = DistributionSummary.from_request_times( - requests, "concurrency" - ) - """ - - mean: float = Field( - description="The mean/average of the distribution.", - ) - median: float = Field( - description="The median of the distribution.", - ) - mode: float = Field( - description="The mode of the distribution.", - ) - variance: float = Field( - description="The variance of the distribution.", - ) - std_dev: float = Field( - description="The standard deviation of the distribution.", - ) - min: float = Field( - description="The minimum value of the distribution.", - ) - max: float = Field( - description="The maximum value of the distribution.", - ) - count: int = Field( - description="The number of values in the distribution.", - ) - total_sum: float = Field( - description="The total sum of the values in the distribution.", - ) - percentiles: Percentiles = Field( - description="The percentiles of the distribution.", - ) - cumulative_distribution_function: list[tuple[float, float]] | None = Field( - description="The cumulative distribution function (CDF) of the distribution.", - default=None, - ) - - @staticmethod - def from_distribution_function( - distribution: list[tuple[float, float]], - include_cdf: bool = False, - ) -> DistributionSummary: - """ - Create statistical summary from weighted distribution or probability function. - - Converts weighted numerical values or probability distribution function (PDF) - into comprehensive statistical summary. Normalizes weights to probabilities - and calculates all statistical metrics including percentiles. 
- - :param distribution: List of (value, weight) or (value, probability) tuples - representing the distribution - :param include_cdf: Whether to include cumulative distribution function - in the output - :return: DistributionSummary instance with calculated statistical metrics - """ - values, weights = zip(*distribution, strict=True) if distribution else ([], []) - values = np.array(values) # type: ignore[assignment] - weights = np.array(weights) # type: ignore[assignment] - - # create the PDF - probabilities = weights / np.sum(weights) # type: ignore[operator] - pdf = np.column_stack((values, probabilities)) - pdf = pdf[np.argsort(pdf[:, 0])] - values = pdf[:, 0] # type: ignore[assignment] - probabilities = pdf[:, 1] - - # calculate the CDF - cumulative_probabilities = np.cumsum(probabilities) - cdf = np.column_stack((values, cumulative_probabilities)) - - # calculate statistics - mean = np.sum(values * probabilities).item() # type: ignore[attr-defined] - median = cdf[np.argmax(cdf[:, 1] >= 0.5), 0].item() if len(cdf) > 0 else 0 # noqa: PLR2004 - mode = values[np.argmax(probabilities)].item() if len(values) > 0 else 0 # type: ignore[call-overload] - variance = np.sum((values - mean) ** 2 * probabilities).item() # type: ignore[attr-defined] - std_dev = math.sqrt(variance) - minimum = values[0].item() if len(values) > 0 else 0 - maximum = values[-1].item() if len(values) > 0 else 0 - count = len(values) - total_sum = np.sum(values).item() # type: ignore[attr-defined] - - return DistributionSummary( - mean=mean, - median=median, - mode=mode, - variance=variance, - std_dev=std_dev, - min=minimum, - max=maximum, - count=count, - total_sum=total_sum, - percentiles=( - Percentiles( - p001=cdf[np.argmax(cdf[:, 1] >= 0.001), 0].item(), # noqa: PLR2004 - p01=cdf[np.argmax(cdf[:, 1] >= 0.01), 0].item(), # noqa: PLR2004 - p05=cdf[np.argmax(cdf[:, 1] >= 0.05), 0].item(), # noqa: PLR2004 - p10=cdf[np.argmax(cdf[:, 1] >= 0.1), 0].item(), # noqa: PLR2004 - p25=cdf[np.argmax(cdf[:, 1] >= 0.25), 0].item(), # noqa: PLR2004 - p50=cdf[np.argmax(cdf[:, 1] >= 0.50), 0].item(), # noqa: PLR2004 - p75=cdf[np.argmax(cdf[:, 1] >= 0.75), 0].item(), # noqa: PLR2004 - p90=cdf[np.argmax(cdf[:, 1] >= 0.9), 0].item(), # noqa: PLR2004 - p95=cdf[np.argmax(cdf[:, 1] >= 0.95), 0].item(), # noqa: PLR2004 - p99=cdf[np.argmax(cdf[:, 1] >= 0.99), 0].item(), # noqa: PLR2004 - p999=cdf[np.argmax(cdf[:, 1] >= 0.999), 0].item(), # noqa: PLR2004 - ) - if len(cdf) > 0 - else Percentiles( - p001=0, - p01=0, - p05=0, - p10=0, - p25=0, - p50=0, - p75=0, - p90=0, - p95=0, - p99=0, - p999=0, - ) - ), - cumulative_distribution_function=cdf.tolist() if include_cdf else None, - ) - - @staticmethod - def from_values( - values: list[float], - weights: list[float] | None = None, - include_cdf: bool = False, - ) -> DistributionSummary: - """ - Create statistical summary from numerical values with optional weights. - - Wrapper around from_distribution_function for simple value lists. If weights - are not provided, all values are equally weighted. Enables statistical - analysis of any numerical dataset. - - :param values: Numerical values representing the distribution - :param weights: Optional weights for each value. 
If not provided, all values - are equally weighted - :param include_cdf: Whether to include cumulative distribution function in - the output DistributionSummary - :return: DistributionSummary instance with calculated statistical metrics - :raises ValueError: If values and weights lists have different lengths - """ - if weights is None: - weights = [1.0] * len(values) - - if len(values) != len(weights): - raise ValueError( - "The length of values and weights must be the same.", - ) - - return DistributionSummary.from_distribution_function( - distribution=list(zip(values, weights, strict=True)), - include_cdf=include_cdf, - ) - - @staticmethod - def from_request_times( - requests: list[tuple[float, float]], - distribution_type: Literal["concurrency", "rate"], - weights: list[float] | None = None, - include_cdf: bool = False, - epsilon: float = 1e-6, - ) -> DistributionSummary: - """ - Create statistical summary from request timing data. - - Analyzes request start/end times to calculate concurrency or rate - distributions. Converts timing events into statistical metrics for - performance analysis and load characterization. - - :param requests: List of (start_time, end_time) tuples for each request - :param distribution_type: Type of analysis - "concurrency" for simultaneous - requests or "rate" for completion rates - :param include_cdf: Whether to include cumulative distribution function - :param epsilon: Threshold for merging close timing events - :return: DistributionSummary with timing-based statistical metrics - :raises ValueError: If distribution_type is not "concurrency" or "rate" - """ - if not weights: - weights = [1.0] * len(requests) - - if len(requests) != len(weights): - raise ValueError( - "The length of requests and weights must be the same.", - ) - - # First convert to timing events based on type - events = DistributionSummary._convert_to_timing_events( - requests, distribution_type, weights - ) - - # Combine any events within epsilon of each other for stability - flattened_events = DistributionSummary._combine_events(events, epsilon) - - # Convert events to value distribution function - distribution: dict[float, float] = defaultdict(float) - - if distribution_type == "concurrency": - # For concurrency, convert to active concurrency over time - active = 0.0 - for ind in range(len(flattened_events)): - time, change = flattened_events[ind] - active += change - flattened_events[ind] = (time, active) - - # Then convert to distribution by weighting each concurrency - # by duration to next event (last event is 0 concurrency) - for ind in range(len(flattened_events) - 1): - time, value = flattened_events[ind] - next_time = flattened_events[ind + 1][0] - duration = next_time - time - distribution[value] += duration - elif distribution_type == "rate": - # For rate, convert to distribution by converting each value - # to a rate (value/duration) weighted by duration from previous - # (first event is 0 rate) - for ind in range(1, len(flattened_events)): - time, value = flattened_events[ind] - prev_time = flattened_events[ind - 1][0] - duration = time - prev_time - rate = value / duration if duration > 0 else 0.0 - distribution[rate] += duration - else: - raise ValueError( - f"Invalid distribution_type '{distribution_type}'. " - "Must be 'concurrency' or 'rate'." 
- ) - - return DistributionSummary.from_distribution_function( - distribution=sorted(distribution.items()), - include_cdf=include_cdf, - ) - - @staticmethod - def _convert_to_timing_events( - requests: list[tuple[float, float]], - distribution_type: Literal["concurrency", "rate"], - weights: list[float], - ) -> list[tuple[float, float]]: - events: list[tuple[float, float]] = [] - - if distribution_type == "concurrency": - # For concurrency, each request adds to concurrency at start - # and subtracts at end - for (start, end), weight in zip(requests, weights, strict=False): - events.append((start, weight)) - events.append((end, -1 * weight)) - elif distribution_type == "rate": - # For rate, each request is added at the end time only - global_start = min(start for start, _ in requests) if requests else 0.0 - events.append((global_start, 0.0)) - for (_, end), weight in zip(requests, weights, strict=False): - events.append((end, weight)) - else: - raise ValueError( - f"Invalid distribution_type '{distribution_type}'. " - "Must be 'concurrency' or 'rate'." - ) - return events - - @staticmethod - def _combine_events( - events: list[tuple[float, float]], - epsilon: float, - ) -> list[tuple[float, float]]: - sorted_events = sorted(events, key=lambda event: event[0]) - flattened_events: list[tuple[float, float]] = ( - [sorted_events.pop(0)] if sorted_events else [] - ) - last_time = flattened_events[0][0] if flattened_events else 0.0 - - for time, val in sorted_events: - if abs(time - last_time) <= epsilon: - last_val = flattened_events[-1][1] - flattened_events[-1] = (last_time, last_val + val) - else: - last_time = time - flattened_events.append((time, val)) - return flattened_events - - @staticmethod - def from_iterable_request_times( - requests: list[tuple[float, float]], - first_iter_times: list[float], - iter_counts: list[int], - first_iter_counts: list[int] | None = None, - include_cdf: bool = False, - epsilon: float = 1e-6, - ) -> DistributionSummary: - """ - Create statistical summary from iterative request timing data. - - Analyzes autoregressive or streaming requests with multiple iterations - between start and end times. Calculates rate distributions based on - iteration timing patterns for LLM token generation analysis. - - :param requests: List of (start_time, end_time) tuples for each request - :param first_iter_times: Times when first iteration was received for - each request - :param iter_counts: Total iteration counts for each request from first - iteration to end - :param first_iter_counts: Iteration counts for first iteration (defaults - to 1 for each request) - :param include_cdf: Whether to include cumulative distribution function - :param epsilon: Threshold for merging close timing events - :return: DistributionSummary with iteration rate statistical metrics - :raises ValueError: If input lists have mismatched lengths - """ - - if first_iter_counts is None: - first_iter_counts = [1] * len(requests) - - if ( - len(requests) != len(first_iter_times) - or len(requests) != len(iter_counts) - or len(requests) != len(first_iter_counts) - ): - raise ValueError( - "requests, first_iter_times, iter_counts, and first_iter_counts must" - "be the same length." 
- f"Given {len(requests)}, {len(first_iter_times)}, {len(iter_counts)}, " - f"{len(first_iter_counts)}", - ) - - # first break up the requests into individual iterable events - events = defaultdict(int) - global_start = min(start for start, _ in requests) if requests else 0 - global_end = max(end for _, end in requests) if requests else 0 - events[global_start] = 0 - events[global_end] = 0 - - for (_, end), first_iter, first_iter_count, total_count in zip( - requests, first_iter_times, first_iter_counts, iter_counts, strict=True - ): - events[first_iter] += first_iter_count - - if total_count > 1: - iter_latency = (end - first_iter) / (total_count - 1) - for ind in range(1, total_count): - events[first_iter + ind * iter_latency] += 1 - - # combine any events that are very close together - flattened_events: list[tuple[float, int]] = [] - - for time, count in sorted(events.items()): - last_time, last_count = ( - flattened_events[-1] if flattened_events else (None, None) - ) - - if ( - last_time is not None - and last_count is not None - and abs(last_time - time) <= epsilon - ): - flattened_events[-1] = (last_time, last_count + count) - else: - flattened_events.append((time, count)) - - # convert to value distribution function - distribution: dict[float, float] = defaultdict(float) - - for ind in range(len(flattened_events) - 1): - start_time, count = flattened_events[ind] - end_time, _ = flattened_events[ind + 1] - duration = end_time - start_time - rate = count / duration - distribution[rate] += duration - - distribution_list = sorted(distribution.items()) - - return DistributionSummary.from_distribution_function( - distribution=distribution_list, - include_cdf=include_cdf, - ) - - -class StatusDistributionSummary( - StatusBreakdown[ - DistributionSummary, - DistributionSummary, - DistributionSummary, - DistributionSummary, - ] -): - """ - Status-grouped statistical summary for request processing analysis. - - Provides comprehensive statistical analysis grouped by request status (total, - successful, incomplete, errored). Enables performance analysis across different - request outcomes for benchmarking and monitoring applications. Each status - category maintains complete DistributionSummary metrics. - - Example: - :: - status_summary = StatusDistributionSummary.from_values( - value_types=["successful", "error", "successful"], - values=[1.5, 10.0, 2.1] - ) - print(f"Success mean: {status_summary.successful.mean}") - print(f"Error rate: {status_summary.errored.count}") - """ - - @staticmethod - def from_values( - value_types: list[Literal["successful", "incomplete", "error"]], - values: list[float], - weights: list[float] | None = None, - include_cdf: bool = False, - ) -> StatusDistributionSummary: - """ - Create status-grouped statistical summary from values and status types. - - Groups numerical values by request status and calculates complete - statistical summaries for each category. Enables performance analysis - across different request outcomes. 
- - :param value_types: Status type for each value ("successful", "incomplete", - or "error") - :param values: Numerical values representing the distribution - :param weights: Optional weights for each value (defaults to equal weighting) - :param include_cdf: Whether to include cumulative distribution functions - :return: StatusDistributionSummary with statistics grouped by status - :raises ValueError: If input lists have mismatched lengths or invalid - status types - """ - if any( - type_ not in {"successful", "incomplete", "error"} for type_ in value_types - ): - raise ValueError( - "value_types must be one of 'successful', 'incomplete', or 'error'. " - f"Got {value_types} instead.", - ) - - if weights is None: - weights = [1.0] * len(values) - - if len(value_types) != len(values) or len(value_types) != len(weights): - raise ValueError( - "The length of value_types, values, and weights must be the same.", - ) - - _, successful_values, successful_weights = ( - zip(*successful, strict=True) - if ( - successful := list( - filter( - lambda val: val[0] == "successful", - zip(value_types, values, weights, strict=True), - ) - ) - ) - else ([], [], []) - ) - _, incomplete_values, incomplete_weights = ( - zip(*incomplete, strict=True) - if ( - incomplete := list( - filter( - lambda val: val[0] == "incomplete", - zip(value_types, values, weights, strict=True), - ) - ) - ) - else ([], [], []) - ) - _, errored_values, errored_weights = ( - zip(*errored, strict=True) - if ( - errored := list( - filter( - lambda val: val[0] == "error", - zip(value_types, values, weights, strict=True), - ) - ) - ) - else ([], [], []) - ) - - return StatusDistributionSummary( - total=DistributionSummary.from_values( - values, - weights, - include_cdf=include_cdf, - ), - successful=DistributionSummary.from_values( - successful_values, # type: ignore[arg-type] - successful_weights, # type: ignore[arg-type] - include_cdf=include_cdf, - ), - incomplete=DistributionSummary.from_values( - incomplete_values, # type: ignore[arg-type] - incomplete_weights, # type: ignore[arg-type] - include_cdf=include_cdf, - ), - errored=DistributionSummary.from_values( - errored_values, # type: ignore[arg-type] - errored_weights, # type: ignore[arg-type] - include_cdf=include_cdf, - ), - ) - - @staticmethod - def from_request_times( - request_types: list[Literal["successful", "incomplete", "error"]], - requests: list[tuple[float, float]], - distribution_type: Literal["concurrency", "rate"], - weights: list[float] | None = None, - include_cdf: bool = False, - epsilon: float = 1e-6, - ) -> StatusDistributionSummary: - """ - Create status-grouped statistical summary from request timing data. - - Analyzes request timings grouped by status to calculate concurrency or - rate distributions for each outcome category. Enables comparative - performance analysis across successful, incomplete, and errored requests. 
- - :param request_types: Status type for each request ("successful", - "incomplete", or "error") - :param requests: List of (start_time, end_time) tuples for each request - :param distribution_type: Analysis type - "concurrency" or "rate" - :param include_cdf: Whether to include cumulative distribution functions - :param epsilon: Threshold for merging close timing events - :return: StatusDistributionSummary with timing statistics by status - :raises ValueError: If input lists have mismatched lengths or invalid types - """ - if distribution_type not in {"concurrency", "rate"}: - raise ValueError( - f"Invalid distribution_type '{distribution_type}'. " - "Must be 'concurrency' or 'rate'." - ) - - if any( - type_ not in {"successful", "incomplete", "error"} - for type_ in request_types - ): - raise ValueError( - "request_types must be one of 'successful', 'incomplete', or 'error'. " - f"Got {request_types} instead.", - ) - - if len(request_types) != len(requests): - raise ValueError( - "The length of request_types and requests must be the same. " - f"Got {len(request_types)} and {len(requests)} instead.", - ) - - if weights is None: - weights = [1.0] * len(requests) - - if len(requests) != len(weights): - raise ValueError( - "The length of requests and weights must be the same." - f"Got {len(requests)} and {len(weights)} instead.", - ) - - _, successful_requests, successful_weights = ( - zip(*successful, strict=False) - if ( - successful := list( - filter( - lambda val: val[0] == "successful", - zip(request_types, requests, weights, strict=False), - ) - ) - ) - else ([], [], []) - ) - _, incomplete_requests, incomplete_weights = ( - zip(*incomplete, strict=False) - if ( - incomplete := list( - filter( - lambda val: val[0] == "incomplete", - zip(request_types, requests, weights, strict=False), - ) - ) - ) - else ([], [], []) - ) - _, errored_requests, errored_weights = ( - zip(*errored, strict=False) - if ( - errored := list( - filter( - lambda val: val[0] == "error", - zip(request_types, requests, weights, strict=False), - ) - ) - ) - else ([], [], []) - ) - - return StatusDistributionSummary( - total=DistributionSummary.from_request_times( - requests, - distribution_type=distribution_type, - weights=weights, - include_cdf=include_cdf, - epsilon=epsilon, - ), - successful=DistributionSummary.from_request_times( - successful_requests, # type: ignore[arg-type] - distribution_type=distribution_type, - weights=successful_weights, # type: ignore[arg-type] - include_cdf=include_cdf, - epsilon=epsilon, - ), - incomplete=DistributionSummary.from_request_times( - incomplete_requests, # type: ignore[arg-type] - distribution_type=distribution_type, - weights=incomplete_weights, # type: ignore[arg-type] - include_cdf=include_cdf, - epsilon=epsilon, - ), - errored=DistributionSummary.from_request_times( - errored_requests, # type: ignore[arg-type] - distribution_type=distribution_type, - weights=errored_weights, # type: ignore[arg-type] - include_cdf=include_cdf, - epsilon=epsilon, - ), - ) - - @staticmethod - def from_iterable_request_times( - request_types: list[Literal["successful", "incomplete", "error"]], - requests: list[tuple[float, float]], - first_iter_times: list[float], - iter_counts: list[int] | None = None, - first_iter_counts: list[int] | None = None, - include_cdf: bool = False, - epsilon: float = 1e-6, - ) -> StatusDistributionSummary: - """ - Create status-grouped statistical summary from iterative request timing data. 
- - Analyzes autoregressive request timings grouped by status to calculate - iteration rate distributions for each outcome category. Enables comparative - analysis of token generation or streaming response performance across - different request statuses. - - :param request_types: Status type for each request ("successful", - "incomplete", or "error") - :param requests: List of (start_time, end_time) tuples for each request - :param first_iter_times: Times when first iteration was received for - each request - :param iter_counts: Total iteration counts for each request (defaults to 1) - :param first_iter_counts: Iteration counts for first iteration (defaults - to 1) - :param include_cdf: Whether to include cumulative distribution functions - :param epsilon: Threshold for merging close timing events - :return: StatusDistributionSummary with iteration statistics by status - :raises ValueError: If input lists have mismatched lengths or invalid types - """ - if any( - type_ not in {"successful", "incomplete", "error"} - for type_ in request_types - ): - raise ValueError( - "request_types must be one of 'successful', 'incomplete', or 'error'. " - f"Got {request_types} instead.", - ) - - if iter_counts is None: - iter_counts = [1] * len(requests) - - if first_iter_counts is None: - first_iter_counts = [1] * len(requests) - - if ( - len(request_types) != len(requests) - or len(requests) != len(first_iter_times) - or len(requests) != len(iter_counts) - or len(requests) != len(first_iter_counts) - ): - raise ValueError( - "request_types, requests, first_iter_times, iter_counts, and " - "first_iter_counts must be the same length." - f"Given {len(request_types)}, {len(requests)}, " - f"{len(first_iter_times)}, {len(iter_counts)}, " - f"{len(first_iter_counts)}", - ) - - ( - _, - successful_requests, - successful_first_iter_times, - successful_iter_counts, - successful_first_iter_counts, - ) = ( - zip(*successful, strict=True) - if ( - successful := list( - filter( - lambda val: val[0] == "successful", - zip( - request_types, - requests, - first_iter_times, - iter_counts, - first_iter_counts, - strict=True, - ), - ) - ) - ) - else ([], [], [], [], []) - ) - ( - _, - incomplete_requests, - incomplete_first_iter_times, - incomplete_iter_counts, - incomplete_first_iter_counts, - ) = ( - zip(*incomplete, strict=True) - if ( - incomplete := list( - filter( - lambda val: val[0] == "incomplete", - zip( - request_types, - requests, - first_iter_times, - iter_counts, - first_iter_counts, - strict=True, - ), - ) - ) - ) - else ([], [], [], [], []) - ) - ( - _, - errored_requests, - errored_first_iter_times, - errored_iter_counts, - errored_first_iter_counts, - ) = ( - zip(*errored, strict=True) - if ( - errored := list( - filter( - lambda val: val[0] == "error", - zip( - request_types, - requests, - first_iter_times, - iter_counts, - first_iter_counts, - strict=True, - ), - ) - ) - ) - else ([], [], [], [], []) - ) - - return StatusDistributionSummary( - total=DistributionSummary.from_iterable_request_times( - requests, - first_iter_times, - iter_counts, - first_iter_counts, - include_cdf=include_cdf, - epsilon=epsilon, - ), - successful=DistributionSummary.from_iterable_request_times( - successful_requests, # type: ignore[arg-type] - successful_first_iter_times, # type: ignore[arg-type] - successful_iter_counts, # type: ignore[arg-type] - successful_first_iter_counts, # type: ignore[arg-type] - include_cdf=include_cdf, - epsilon=epsilon, - ), - incomplete=DistributionSummary.from_iterable_request_times( - 
incomplete_requests, # type: ignore[arg-type] - incomplete_first_iter_times, # type: ignore[arg-type] - incomplete_iter_counts, # type: ignore[arg-type] - incomplete_first_iter_counts, # type: ignore[arg-type] - include_cdf=include_cdf, - epsilon=epsilon, - ), - errored=DistributionSummary.from_iterable_request_times( - errored_requests, # type: ignore[arg-type] - errored_first_iter_times, # type: ignore[arg-type] - errored_iter_counts, # type: ignore[arg-type] - errored_first_iter_counts, # type: ignore[arg-type] - include_cdf=include_cdf, - epsilon=epsilon, - ), - ) - - -class RunningStats(StandardBaseModel): - """ - Real-time statistics tracking for streaming numerical data. - - Maintains mean, rate, and cumulative statistics for continuous data streams - without storing individual values. Optimized for memory efficiency in - long-running monitoring applications. Supports arithmetic operators for - convenient value addition and provides computed properties for derived metrics. - - Example: - :: - stats = RunningStats() - stats += 10.5 # Add value using operator - stats.update(20.0, count=3) # Add value with custom count - print(f"Mean: {stats.mean}, Rate: {stats.rate}") - """ - - start_time: float = Field( - default_factory=timer.time, - description=( - "The time the running statistics object was created. " - "This is used to calculate the rate of the statistics." - ), - ) - count: int = Field( - default=0, - description="The number of values added to the running statistics.", - ) - total: float = Field( - default=0.0, - description="The total sum of the values added to the running statistics.", - ) - last: float = Field( - default=0.0, - description="The last value added to the running statistics.", - ) - - @computed_field # type: ignore[misc] - @property - def mean(self) -> float: - """ - :return: The mean of the running statistics (total / count). - If count is 0, return 0.0. - """ - if self.count == 0: - return 0.0 - return self.total / self.count - - @computed_field # type: ignore[misc] - @property - def rate(self) -> float: - """ - :return: The rate of the running statistics - (total / (time.time() - start_time)). - If count is 0, return 0.0. - """ - if self.count == 0: - return 0.0 - return self.total / (timer.time() - self.start_time) - - def __add__(self, value: Any) -> float: - """ - Add value using + operator and return current mean. - - :param value: Numerical value to add to the running statistics - :return: Updated mean after adding the value - :raises ValueError: If value is not numeric (int or float) - """ - if not isinstance(value, int | float): - raise ValueError( - f"Value must be an int or float, got {type(value)} instead.", - ) - - self.update(value) - - return self.mean - - def __iadd__(self, value: Any) -> RunningStats: - """ - Add value using += operator and return updated instance. - - :param value: Numerical value to add to the running statistics - :return: Self reference for method chaining - :raises ValueError: If value is not numeric (int or float) - """ - if not isinstance(value, int | float): - raise ValueError( - f"Value must be an int or float, got {type(value)} instead.", - ) - - self.update(value) - - return self - - def update(self, value: float, count: int = 1) -> None: - """ - Update running statistics with new value and count. 
- - :param value: Numerical value to add to the running statistics - :param count: Number of occurrences to count for this value (defaults to 1) - """ - self.count += count - self.total += value - self.last = value - - -class TimeRunningStats(RunningStats): - """ - Specialized running statistics for time-based measurements. - - Extends RunningStats with time-specific computed properties for millisecond - conversions. Designed for tracking latency, duration, and timing metrics in - performance monitoring applications. - - Example: - :: - time_stats = TimeRunningStats() - time_stats += 0.125 # Add 125ms in seconds - print(f"Mean: {time_stats.mean_ms}ms, Total: {time_stats.total_ms}ms") - """ - - @computed_field # type: ignore[misc] - @property - def total_ms(self) -> float: - """ - :return: The total time multiplied by 1000.0 to convert to milliseconds. - """ - return self.total * 1000.0 - - @computed_field # type: ignore[misc] - @property - def last_ms(self) -> float: - """ - :return: The last time multiplied by 1000.0 to convert to milliseconds. - """ - return self.last * 1000.0 - - @computed_field # type: ignore[misc] - @property - def mean_ms(self) -> float: - """ - :return: The mean time multiplied by 1000.0 to convert to milliseconds. - """ - return self.mean * 1000.0 - - @computed_field # type: ignore[misc] - @property - def rate_ms(self) -> float: - """ - :return: The rate of the running statistics multiplied by 1000.0 - to convert to milliseconds. - """ - return self.rate * 1000.0 diff --git a/tests/unit/benchmark/test_output.py b/tests/unit/benchmark/test_output.py index 416a9b2b..590f40f0 100644 --- a/tests/unit/benchmark/test_output.py +++ b/tests/unit/benchmark/test_output.py @@ -10,7 +10,7 @@ from guidellm.benchmark import ( GenerativeBenchmarksReport, ) -from guidellm.benchmark.output import ( +from guidellm.benchmark.outputs.output import ( GenerativeBenchmarkerConsole, GenerativeBenchmarkerCSV, ) diff --git a/tests/unit/mock_benchmark.py b/tests/unit/mock_benchmark.py index 0546d28f..9da4227d 100644 --- a/tests/unit/mock_benchmark.py +++ b/tests/unit/mock_benchmark.py @@ -1,7 +1,7 @@ """Mock benchmark objects for unit testing.""" from guidellm.benchmark import ( - BenchmarkSchedulerStats, + BenchmarkSchedulerMetrics, GenerativeBenchmark, GenerativeMetrics, ) @@ -113,7 +113,7 @@ def mock_generative_benchmark() -> GenerativeBenchmark: ), env_args=StandardBaseDict(), extras=StandardBaseDict(), - run_stats=BenchmarkSchedulerStats( + run_stats=BenchmarkSchedulerMetrics( start_time=1, end_time=2, requests_made=StatusBreakdown( diff --git a/tests/unit/presentation/__init__.py b/tests/unit/presentation/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/unit/presentation/test_data_models.py b/tests/unit/presentation/test_data_models.py deleted file mode 100644 index c1663c43..00000000 --- a/tests/unit/presentation/test_data_models.py +++ /dev/null @@ -1,20 +0,0 @@ -import pytest - -from guidellm.presentation.data_models import Bucket - - -@pytest.mark.smoke -def test_bucket_from_data(): - buckets, bucket_width = Bucket.from_data([8, 8, 8, 8, 8, 8], 1) - assert len(buckets) == 1 - assert buckets[0].value == 8.0 - assert buckets[0].count == 6 - assert bucket_width == 1 - - buckets, bucket_width = Bucket.from_data([8, 8, 8, 8, 8, 7], 1) - assert len(buckets) == 2 - assert buckets[0].value == 7.0 - assert buckets[0].count == 1 - assert buckets[1].value == 8.0 - assert buckets[1].count == 5 - assert bucket_width == 1 diff --git 
a/tests/unit/presentation/test_injector.py b/tests/unit/presentation/test_injector.py deleted file mode 100644 index da269815..00000000 --- a/tests/unit/presentation/test_injector.py +++ /dev/null @@ -1,87 +0,0 @@ -from pathlib import Path - -import pytest -from pydantic import BaseModel - -from guidellm.presentation.injector import create_report, inject_data -from guidellm.settings import settings - - -class ExampleModel(BaseModel): - name: str - version: str - - -@pytest.mark.smoke -def test_inject_data(): - html = "" - expected_html = ( - "" - ) - js_data = { - "window.runInfo = {};": "window.runInfo =" - '{ "model": { "name": "neuralmagic/Qwen2.5-7B-quantized.w8a8" } };' - } - result = inject_data( - js_data, - html, - ) - assert result == expected_html - - -@pytest.mark.smoke -def test_create_report_to_file(tmpdir): - js_data = { - "window.runInfo = {};": "window.runInfo =" - '{ "model": { "name": "neuralmagic/Qwen2.5-7B-quantized.w8a8" } };' - } - html_content = "" - expected_html_content = ( - "" - ) - - mock_html_path = tmpdir.join("template.html") - mock_html_path.write(html_content) - settings.report_generation.source = str(mock_html_path) - - output_path = tmpdir.join("output.html") - result_path = create_report(js_data, str(output_path)) - result_content = result_path.read_text() - - assert result_path == output_path - assert result_content == expected_html_content - - -@pytest.mark.smoke -def test_create_report_with_file_nested_in_dir(tmpdir): - js_data = { - "window.runInfo = {};": "window.runInfo =" - '{ "model": { "name": "neuralmagic/Qwen2.5-7B-quantized.w8a8" } };' - } - html_content = "" - expected_html_content = ( - "" - ) - - output_dir = tmpdir.mkdir("output_dir") - mock_html_path = tmpdir.join("template.html") - mock_html_path.write(html_content) - settings.report_generation.source = str(mock_html_path) - - output_path = Path(output_dir) / "report.html" - result_path = create_report(js_data, str(output_path)) - - with Path(result_path).open("r") as file: - result_content = file.read() - - assert result_path == output_path - assert result_content == expected_html_content diff --git a/tests/unit/utils/test_pydantic_utils.py b/tests/unit/utils/test_pydantic_utils.py index b1278f51..d57b0663 100644 --- a/tests/unit/utils/test_pydantic_utils.py +++ b/tests/unit/utils/test_pydantic_utils.py @@ -10,14 +10,7 @@ import pytest from pydantic import BaseModel, Field, ValidationError -from guidellm.utils import ( - PydanticClassRegistryMixin, - ReloadableBaseModel, - StandardBaseDict, - StandardBaseModel, - StatusBreakdown, -) -from guidellm.utils.pydantic_utils import ( +from guidellm.schemas.base import ( BaseModelT, ErroredT, IncompleteT, @@ -25,6 +18,13 @@ SuccessfulT, TotalT, ) +from guidellm.utils import ( + PydanticClassRegistryMixin, + ReloadableBaseModel, + StandardBaseDict, + StandardBaseModel, + StatusBreakdown, +) @pytest.mark.smoke diff --git a/tests/unit/utils/test_statistics.py b/tests/unit/utils/test_statistics.py index d0f04d99..73c383f2 100644 --- a/tests/unit/utils/test_statistics.py +++ b/tests/unit/utils/test_statistics.py @@ -1,785 +1,1123 @@ +from __future__ import annotations + import math -import time from typing import Literal import numpy as np import pytest +from pydantic import BaseModel, ValidationError -from guidellm.utils.statistics import ( - DistributionSummary, - Percentiles, - RunningStats, - StatusDistributionSummary, - TimeRunningStats, -) - - -def create_default_percentiles() -> Percentiles: - return Percentiles( - p001=0.1, - p01=1.0, - 
p05=5.0, - p10=10.0, - p25=25.0, - p50=50.0, - p75=75.0, - p90=90.0, - p95=95.0, - p99=99.0, - p999=99.9, - ) - - -def create_default_distribution_summary() -> DistributionSummary: - return DistributionSummary( - mean=50.0, - median=50.0, - mode=50.0, - variance=835, - std_dev=math.sqrt(835), - min=0.0, - max=100.0, - count=1001, - total_sum=50050.0, - percentiles=create_default_percentiles(), - ) - - -@pytest.mark.smoke -def test_percentiles_initialization(): - percentiles = create_default_percentiles() - assert percentiles.p001 == 0.1 - assert percentiles.p01 == 1.0 - assert percentiles.p05 == 5.0 - assert percentiles.p10 == 10.0 - assert percentiles.p25 == 25.0 - assert percentiles.p50 == 50.0 - assert percentiles.p75 == 75.0 - assert percentiles.p90 == 90.0 - assert percentiles.p95 == 95.0 - assert percentiles.p99 == 99.0 - assert percentiles.p999 == 99.9 - - -@pytest.mark.smoke -def test_percentiles_invalid_initialization(): - test_kwargs = { - "p001": 0.1, - "p01": 1.0, - "p05": 5.0, - "p10": 10.0, - "p25": 25.0, - "p50": 50.0, - "p75": 75.0, - "p90": 90.0, - "p95": 95.0, - "p99": 99.0, - "p999": 99.9, - } - test_missing_keys = list(test_kwargs.keys()) - - for missing_key in test_missing_keys: - kwargs = {key: val for key, val in test_kwargs.items() if key != missing_key} - with pytest.raises(ValueError): - Percentiles(**kwargs) - - -@pytest.mark.smoke -def test_percentiles_marshalling(): - percentiles = create_default_percentiles() - serialized = percentiles.model_dump() - deserialized = Percentiles.model_validate(serialized) - - for key, value in vars(percentiles).items(): - assert getattr(deserialized, key) == value - - -@pytest.mark.smoke -def test_distribution_summary_initilaization(): - distribution_summary = create_default_distribution_summary() - assert distribution_summary.mean == 50.0 - assert distribution_summary.median == 50.0 - assert distribution_summary.mode == 50.0 - assert distribution_summary.variance == 835 - assert distribution_summary.std_dev == math.sqrt(835) - assert distribution_summary.min == 0.0 - assert distribution_summary.max == 100.0 - assert distribution_summary.count == 1001 - assert distribution_summary.total_sum == 50050.0 - assert distribution_summary.percentiles.p001 == 0.1 - assert distribution_summary.percentiles.p01 == 1.0 - assert distribution_summary.percentiles.p05 == 5.0 - assert distribution_summary.percentiles.p10 == 10.0 - assert distribution_summary.percentiles.p25 == 25.0 - assert distribution_summary.percentiles.p50 == 50.0 - assert distribution_summary.percentiles.p75 == 75.0 - assert distribution_summary.percentiles.p90 == 90.0 - assert distribution_summary.percentiles.p95 == 95.0 - assert distribution_summary.percentiles.p99 == 99.0 - assert distribution_summary.percentiles.p999 == 99.9 - - -@pytest.mark.smoke -def test_distribution_summary_invalid_initialization(): - test_kwargs = { - "mean": 50.0, - "median": 50.0, - "mode": 50.0, - "variance": 835, - "std_dev": math.sqrt(835), - "min": 0.0, - "max": 100.0, - "count": 1001, - "total_sum": 50050.0, - "percentiles": create_default_percentiles(), - } - test_missing_keys = list(test_kwargs.keys()) - for missing_key in test_missing_keys: - kwargs = {key: val for key, val in test_kwargs.items() if key != missing_key} - with pytest.raises(ValueError): - DistributionSummary(**kwargs) # type: ignore[arg-type] - - -@pytest.mark.smoke -def test_distribution_summary_marshalling(): - distribution_summary = create_default_distribution_summary() - serialized = distribution_summary.model_dump() - 
deserialized = DistributionSummary.model_validate(serialized) - - for key, value in vars(distribution_summary).items(): - assert getattr(deserialized, key) == value - - -@pytest.mark.smoke -def test_distribution_summary_from_distribution_function(): - values = [val / 10.0 for val in range(1001)] - distribution = [(val, 1.0) for val in values] - distribution_summary = DistributionSummary.from_distribution_function(distribution) - assert distribution_summary.mean == pytest.approx(np.mean(values)) - assert distribution_summary.median == pytest.approx(np.median(values)) - assert distribution_summary.mode == 0.0 - assert distribution_summary.variance == pytest.approx(np.var(values, ddof=0)) - assert distribution_summary.std_dev == pytest.approx(np.std(values, ddof=0)) - assert distribution_summary.min == min(values) - assert distribution_summary.max == max(values) - assert distribution_summary.count == len(values) - assert distribution_summary.total_sum == sum(values) - assert distribution_summary.percentiles.p001 == pytest.approx( - np.percentile(values, 0.1) - ) - assert distribution_summary.percentiles.p01 == pytest.approx( - np.percentile(values, 1.0) - ) - assert distribution_summary.percentiles.p05 == pytest.approx( - np.percentile(values, 5.0) - ) - assert distribution_summary.percentiles.p10 == pytest.approx( - np.percentile(values, 10.0) - ) - assert distribution_summary.percentiles.p25 == pytest.approx( - np.percentile(values, 25.0) - ) - assert distribution_summary.percentiles.p50 == pytest.approx( - np.percentile(values, 50.0) - ) - assert distribution_summary.percentiles.p75 == pytest.approx( - np.percentile(values, 75.0) - ) - assert distribution_summary.percentiles.p90 == pytest.approx( - np.percentile(values, 90.0) - ) - assert distribution_summary.percentiles.p95 == pytest.approx( - np.percentile(values, 95.0) - ) - assert distribution_summary.percentiles.p99 == pytest.approx( - np.percentile(values, 99.0) - ) - assert distribution_summary.percentiles.p999 == pytest.approx( - np.percentile(values, 99.9) - ) - assert distribution_summary.cumulative_distribution_function is None - - distribution_summary_cdf = DistributionSummary.from_distribution_function( - distribution, include_cdf=True - ) - assert distribution_summary_cdf.cumulative_distribution_function is not None - assert len(distribution_summary_cdf.cumulative_distribution_function) == len(values) - - -def test_distribution_summary_from_values(): - values = [val / 10 for val in range(1001)] - distribution_summary = DistributionSummary.from_values(values) - assert distribution_summary.mean == pytest.approx(np.mean(values)) - assert distribution_summary.median == pytest.approx(np.median(values)) - assert distribution_summary.mode == 0.0 - assert distribution_summary.variance == pytest.approx(np.var(values, ddof=0)) - assert distribution_summary.std_dev == pytest.approx(np.std(values, ddof=0)) - assert distribution_summary.min == min(values) - assert distribution_summary.max == max(values) - assert distribution_summary.count == len(values) - assert distribution_summary.total_sum == sum(values) - assert distribution_summary.percentiles.p001 == pytest.approx( - np.percentile(values, 0.1) - ) - assert distribution_summary.percentiles.p01 == pytest.approx( - np.percentile(values, 1.0) - ) - assert distribution_summary.percentiles.p05 == pytest.approx( - np.percentile(values, 5.0) - ) - assert distribution_summary.percentiles.p10 == pytest.approx( - np.percentile(values, 10.0) - ) - assert distribution_summary.percentiles.p25 == 
pytest.approx( - np.percentile(values, 25.0) - ) - assert distribution_summary.percentiles.p50 == pytest.approx( - np.percentile(values, 50.0) - ) - assert distribution_summary.percentiles.p75 == pytest.approx( - np.percentile(values, 75.0) - ) - assert distribution_summary.percentiles.p90 == pytest.approx( - np.percentile(values, 90.0) - ) - assert distribution_summary.percentiles.p95 == pytest.approx( - np.percentile(values, 95.0) - ) - assert distribution_summary.percentiles.p99 == pytest.approx( - np.percentile(values, 99.0) - ) - assert distribution_summary.percentiles.p999 == pytest.approx( - np.percentile(values, 99.9) - ) - assert distribution_summary.cumulative_distribution_function is None +from guidellm.utils import DistributionSummary, Percentiles, StatusDistributionSummary - distribution_summary_weights = DistributionSummary.from_values( - values, weights=[2] * len(values) - ) - assert distribution_summary_weights.mean == pytest.approx(np.mean(values)) - assert distribution_summary_weights.median == pytest.approx(np.median(values)) - assert distribution_summary_weights.mode == 0.0 - assert distribution_summary_weights.variance == pytest.approx( - np.var(values, ddof=0) - ) - assert distribution_summary_weights.std_dev == pytest.approx(np.std(values, ddof=0)) - assert distribution_summary_weights.min == min(values) - assert distribution_summary_weights.max == max(values) - assert distribution_summary_weights.count == len(values) - assert distribution_summary_weights.total_sum == sum(values) - assert distribution_summary_weights.cumulative_distribution_function is None - - distribution_summary_cdf = DistributionSummary.from_values(values, include_cdf=True) - assert distribution_summary_cdf.cumulative_distribution_function is not None - assert len(distribution_summary_cdf.cumulative_distribution_function) == len(values) - - -def test_distribution_summary_from_request_times_concurrency(): - # create consistent timestamped values matching a rate of 10 per second - requests = [(val / 10, val / 10 + 1) for val in range(10001)] - distribution_summary = DistributionSummary.from_request_times( - requests, distribution_type="concurrency" - ) - assert distribution_summary.mean == pytest.approx(10.0, abs=0.01) - assert distribution_summary.median == pytest.approx(10.0) - assert distribution_summary.mode == 10.0 - assert distribution_summary.variance == pytest.approx(0, abs=0.1) - assert distribution_summary.std_dev == pytest.approx(0, abs=0.3) - assert distribution_summary.min == pytest.approx(1) - assert distribution_summary.max == pytest.approx(10.0) - assert distribution_summary.count == 10 - assert distribution_summary.total_sum == pytest.approx(55.0) - assert distribution_summary.percentiles.p001 == pytest.approx(10, abs=5) - assert distribution_summary.percentiles.p01 == pytest.approx(10) - assert distribution_summary.percentiles.p05 == pytest.approx(10) - assert distribution_summary.percentiles.p10 == pytest.approx(10) - assert distribution_summary.percentiles.p25 == pytest.approx(10) - assert distribution_summary.percentiles.p50 == pytest.approx(10) - assert distribution_summary.percentiles.p75 == pytest.approx(10) - assert distribution_summary.percentiles.p90 == pytest.approx(10) - assert distribution_summary.percentiles.p95 == pytest.approx(10) - assert distribution_summary.percentiles.p99 == pytest.approx(10) - assert distribution_summary.percentiles.p999 == pytest.approx(10) - assert distribution_summary.cumulative_distribution_function is None - - distribution_summary_cdf = 
DistributionSummary.from_request_times( - requests, distribution_type="concurrency", include_cdf=True - ) - assert distribution_summary_cdf.cumulative_distribution_function is not None - assert len(distribution_summary_cdf.cumulative_distribution_function) == 10 +def generate_pdf( + distribution: str | None, distribution_args: dict, size: int +) -> np.ndarray: + if distribution is None: + return np.empty((0, 2)) -def test_distribution_summary_from_request_times_rate(): - # create consistent timestamped values matching a rate of 10 per second - requests = [(val / 10, val / 10 + 1) for val in range(10001)] - distribution_summary = DistributionSummary.from_request_times( - requests, distribution_type="rate" - ) - assert distribution_summary.mean == pytest.approx(10.0, abs=0.01) - assert distribution_summary.median == pytest.approx(10.0) - assert distribution_summary.mode == pytest.approx(10.0) - assert distribution_summary.variance == pytest.approx(0, abs=0.1) - assert distribution_summary.std_dev == pytest.approx(0, abs=0.3) - assert distribution_summary.min == pytest.approx(1.0) - assert distribution_summary.max == pytest.approx(10.0) - assert distribution_summary.count == 12 - assert distribution_summary.total_sum == pytest.approx(111.0) - assert distribution_summary.percentiles.p001 == pytest.approx(10.0, abs=0.5) - assert distribution_summary.percentiles.p01 == pytest.approx(10.0) - assert distribution_summary.percentiles.p05 == pytest.approx(10.0) - assert distribution_summary.percentiles.p10 == pytest.approx(10.0) - assert distribution_summary.percentiles.p25 == pytest.approx(10.0) - assert distribution_summary.percentiles.p50 == pytest.approx(10.0) - assert distribution_summary.percentiles.p75 == pytest.approx(10.0) - assert distribution_summary.percentiles.p90 == pytest.approx(10.0) - assert distribution_summary.percentiles.p95 == pytest.approx(10.0) - assert distribution_summary.percentiles.p99 == pytest.approx(10.0) - assert distribution_summary.percentiles.p999 == pytest.approx(10.0) - assert distribution_summary.cumulative_distribution_function is None - - distribution_summary_cdf = DistributionSummary.from_request_times( - requests, distribution_type="rate", include_cdf=True - ) - assert distribution_summary_cdf.cumulative_distribution_function is not None - assert len(distribution_summary_cdf.cumulative_distribution_function) == 12 - - -def test_distribution_summary_from_iterable_request_times(): - # create consistent timestamped values matching a rate of 10 per second - requests = [(val / 10, val / 10 + 1) for val in range(10001)] - # create 9 iterations for each request with first iter at start + 0.1 - # and spaced at 0.1 seconds apart - first_iter_times = [val / 10 + 0.1 for val in range(10001)] - iter_counts = [9 for _ in range(10001)] - first_iter_counts = [1 for _ in range(10001)] - - distribution_summary = DistributionSummary.from_iterable_request_times( - requests, first_iter_times, iter_counts, first_iter_counts - ) - assert distribution_summary.mean == pytest.approx(90.0, abs=0.1) - assert distribution_summary.median == pytest.approx(80.0) - assert distribution_summary.mode == pytest.approx(80.0) - assert distribution_summary.variance == pytest.approx(704.463, abs=0.001) - assert distribution_summary.std_dev == pytest.approx(26.541, abs=0.001) - assert distribution_summary.min == pytest.approx(0.0) - assert distribution_summary.max == pytest.approx(160.0) - assert distribution_summary.count == 44 - assert distribution_summary.total_sum == pytest.approx(3538.85, 
abs=0.01) - assert distribution_summary.percentiles.p001 == pytest.approx(80.0) - assert distribution_summary.percentiles.p01 == pytest.approx(80.0) - assert distribution_summary.percentiles.p05 == pytest.approx(80.0) - assert distribution_summary.percentiles.p10 == pytest.approx(80.0) - assert distribution_summary.percentiles.p25 == pytest.approx(80.0) - assert distribution_summary.percentiles.p50 == pytest.approx(80.0) - assert distribution_summary.percentiles.p75 == pytest.approx(80.0) - assert distribution_summary.percentiles.p90 == pytest.approx(160.0) - assert distribution_summary.percentiles.p95 == pytest.approx(160.0) - assert distribution_summary.percentiles.p99 == pytest.approx(160.0) - assert distribution_summary.percentiles.p999 == pytest.approx(160.0) - assert distribution_summary.cumulative_distribution_function is None - - distribution_summary_cdf = DistributionSummary.from_iterable_request_times( - requests, first_iter_times, iter_counts, first_iter_counts, include_cdf=True - ) - assert distribution_summary_cdf.cumulative_distribution_function is not None - assert len(distribution_summary_cdf.cumulative_distribution_function) == 44 - - -def test_status_distribution_summary_initialization(): - status_distribution_summary = StatusDistributionSummary( - total=create_default_distribution_summary(), - successful=create_default_distribution_summary(), - incomplete=create_default_distribution_summary(), - errored=create_default_distribution_summary(), - ) - assert status_distribution_summary.total.mean == 50.0 - assert status_distribution_summary.successful.mean == 50.0 - assert status_distribution_summary.incomplete.mean == 50.0 - assert status_distribution_summary.errored.mean == 50.0 - - -def test_status_distribution_summary_marshalling(): - status_distribution_summary = StatusDistributionSummary( - total=create_default_distribution_summary(), - successful=create_default_distribution_summary(), - incomplete=create_default_distribution_summary(), - errored=create_default_distribution_summary(), - ) - serialized = status_distribution_summary.model_dump() - deserialized = StatusDistributionSummary.model_validate(serialized) - - for key, value in vars(status_distribution_summary).items(): - for child_key, child_value in vars(value).items(): - assert getattr(getattr(deserialized, key), child_key) == child_value - - -def test_status_distribution_summary_from_values(): - value_types: list[Literal["successful", "incomplete", "error"]] = [ - "successful", - "incomplete", - "error", - ] * 1000 - values = [float(val % 3) for val in range(3000)] - status_distribution_summary = StatusDistributionSummary.from_values( - value_types, values - ) - assert status_distribution_summary.total.count == len(values) - assert status_distribution_summary.total.mean == pytest.approx(np.mean(values)) - assert status_distribution_summary.total.cumulative_distribution_function is None - assert status_distribution_summary.successful.mean == pytest.approx( - np.mean( - [val for ind, val in enumerate(values) if value_types[ind] == "successful"] + if distribution == "normal": + mean = distribution_args.get("loc", 0.0) + std_dev = distribution_args.get("scale", 1.0) + x_values = np.linspace(mean - 4 * std_dev, mean + 4 * std_dev, size) + pdf_values = (1.0 / np.sqrt(2 * np.pi * std_dev**2)) * np.exp( + -1.0 * ((x_values - mean) ** 2) / (2 * std_dev**2) ) - ) - assert status_distribution_summary.successful.count == len( - [val for ind, val in enumerate(values) if value_types[ind] == "successful"] - ) - assert ( - 
status_distribution_summary.successful.cumulative_distribution_function is None - ) - assert status_distribution_summary.incomplete.mean == pytest.approx( - np.mean( - [val for ind, val in enumerate(values) if value_types[ind] == "incomplete"] + elif distribution == "uniform": + low = distribution_args.get("low", 0.0) + high = distribution_args.get("high", 1.0) + x_values = np.linspace(low, high, size) + pdf_values = np.full_like(x_values, 1.0 / (high - low)) + elif distribution == "exponential": + scale = distribution_args.get("scale", 1.0) + x_values = np.linspace(0, 10 * scale, size) + pdf_values = (1 / scale) * np.exp(-x_values / scale) + elif distribution == "poisson": + lam = distribution_args.get("lam", 1.0) + x_values = np.arange(0, 20) + pdf_values = (lam**x_values * np.exp(-lam)) / np.array( + [math.factorial(x) for x in x_values] ) - ) - assert status_distribution_summary.incomplete.count == len( - [val for ind, val in enumerate(values) if value_types[ind] == "incomplete"] - ) - assert ( - status_distribution_summary.incomplete.cumulative_distribution_function is None - ) - assert status_distribution_summary.errored.mean == pytest.approx( - np.mean([val for ind, val in enumerate(values) if value_types[ind] == "error"]) - ) - assert status_distribution_summary.errored.count == len( - [val for ind, val in enumerate(values) if value_types[ind] == "error"] - ) - assert status_distribution_summary.errored.cumulative_distribution_function is None - - status_distribution_summary_cdf = StatusDistributionSummary.from_values( - value_types, values, include_cdf=True - ) - assert ( - status_distribution_summary_cdf.total.cumulative_distribution_function - is not None - ) - assert ( - status_distribution_summary_cdf.successful.cumulative_distribution_function - is not None - ) - assert ( - status_distribution_summary_cdf.incomplete.cumulative_distribution_function - is not None - ) - assert ( - status_distribution_summary_cdf.errored.cumulative_distribution_function - is not None - ) - - -def test_status_distribution_summary_from_request_times(): - request_types: list[Literal["successful", "incomplete", "error"]] = [ - "successful", - "incomplete", - "error", - ] * 1000 - requests = [((val % 3) / 10, (val % 3) / 10 + 1) for val in range(3000)] - status_distribution_summary = StatusDistributionSummary.from_request_times( - request_types, requests, distribution_type="concurrency" - ) - assert status_distribution_summary.total.mean == pytest.approx(2500.0, abs=0.01) - assert status_distribution_summary.total.cumulative_distribution_function is None - assert status_distribution_summary.successful.mean == pytest.approx( - 1000.0, abs=0.01 - ) - assert ( - status_distribution_summary.successful.cumulative_distribution_function is None - ) - assert status_distribution_summary.incomplete.mean == pytest.approx( - 1000.0, abs=0.01 - ) - assert ( - status_distribution_summary.incomplete.cumulative_distribution_function is None - ) - assert status_distribution_summary.errored.mean == pytest.approx(1000.0, abs=0.01) - assert status_distribution_summary.errored.cumulative_distribution_function is None - - status_distribution_summary_cdf = StatusDistributionSummary.from_request_times( - request_types, requests, distribution_type="concurrency", include_cdf=True - ) - assert ( - status_distribution_summary_cdf.total.cumulative_distribution_function - is not None - ) - assert ( - status_distribution_summary_cdf.successful.cumulative_distribution_function - is not None - ) - assert ( - 
status_distribution_summary_cdf.incomplete.cumulative_distribution_function - is not None - ) - assert ( - status_distribution_summary_cdf.errored.cumulative_distribution_function - is not None - ) + else: + raise ValueError(f"Unsupported distribution type: {distribution}") + + return np.column_stack((x_values, pdf_values / np.sum(pdf_values))) + + +@pytest.fixture( + params=[ + {"distribution": None, "distribution_args": {}}, + { + "distribution": "normal", + "distribution_args": {"loc": 5.0, "scale": 1.0}, + }, + { + "distribution": "normal", + "distribution_args": {"loc": 100.0, "scale": 15.0}, + }, + {"distribution": "uniform", "distribution_args": {"low": 3.4, "high": 9.8}}, + { + "distribution": "exponential", + "distribution_args": {"scale": 1.0}, + }, + { + "distribution": "poisson", + "distribution_args": {"lam": 5.0}, + }, + ] +) +def probability_distributions( + request, +) -> tuple[str | None, np.ndarray, np.ndarray, dict[str, float]]: + """ + Create various probability distributions for testing. + :return: A tuple containing the distribution type, the generated values, + the pdf, and the correct distribution statistics. + """ + distribution_type: str | None = request.param["distribution"] + distribution_args: dict[str, float] = request.param["distribution_args"] + + num_samples = 10000 + rng = np.random.default_rng(seed=42) + percentile_probs = { + "p001": 0.001, + "p01": 0.01, + "p05": 0.05, + "p10": 0.1, + "p25": 0.25, + "p50": 0.5, + "p75": 0.75, + "p90": 0.9, + "p95": 0.95, + "p99": 0.99, + "p999": 0.999, + } -def test_status_distribution_summary_from_iterable_request_times(): - request_types: list[Literal["successful", "incomplete", "error"]] = [ - "successful", - "incomplete", - "error", - ] * 1000 - requests = [(val % 3 / 10, val % 3 / 10 + 1) for val in range(3000)] - first_iter_times = [val % 3 / 10 + 0.1 for val in range(3000)] - iter_counts = [9 for _ in range(3000)] - first_iter_counts = [1 for _ in range(3000)] - status_distribution_summary = StatusDistributionSummary.from_iterable_request_times( - request_types, - requests, - first_iter_times, - iter_counts, - first_iter_counts, - ) - assert status_distribution_summary.total.mean == pytest.approx(21666.66, abs=0.01) - assert status_distribution_summary.total.cumulative_distribution_function is None - assert status_distribution_summary.successful.mean == pytest.approx( - 8000.0, abs=0.01 - ) - assert ( - status_distribution_summary.successful.cumulative_distribution_function is None - ) - assert status_distribution_summary.incomplete.mean == pytest.approx( - 8000.0, abs=0.01 - ) - assert ( - status_distribution_summary.incomplete.cumulative_distribution_function is None - ) - assert status_distribution_summary.errored.mean == pytest.approx(8000.0, abs=0.01) - assert status_distribution_summary.errored.cumulative_distribution_function is None + if distribution_type is None: + # Empty / 0's distribution + return ( + None, + [], + np.empty((0, 2)), + { + "mean": 0.0, + "median": 0.0, + "mode": 0.0, + "variance": 0.0, + "std_dev": 0.0, + "min": 0.0, + "max": 0.0, + "count": 0, + "total_sum": 0.0, + "percentiles": dict.fromkeys(percentile_probs.keys(), 0.0), + }, + ) - status_distribution_summary_cdf = ( - StatusDistributionSummary.from_iterable_request_times( - request_types, + rng = np.random.default_rng(seed=42) + samples = getattr(rng, distribution_type)(**distribution_args, size=num_samples) + pdf = np.column_stack( + (np.sort(samples), np.zeros_like(samples) + 1.0 / num_samples) + ) + + return ( + 
distribution_type, + samples, + pdf, + { + "mean": float(np.mean(samples)), + "median": float(np.median(samples)), + "variance": float(np.var(samples)), + "std_dev": float(np.std(samples)), + "min": float(np.min(samples)), + "max": float(np.max(samples)), + "count": int(len(samples)), + "total_sum": float(np.sum(samples)), + "percentiles": { + key: float(np.percentile(samples, per * 100)) + for key, per in percentile_probs.items() + }, + }, + ) + + +def concurrency_distributions( + concurrency_type: Literal[ + "sequential", + "parallel", + "constant_rate", + "burst", + "triangular_ramp", + "normal_dist", + ], + num_requests: int = 100, + start_time: float = 0.0, + end_time: float = 100.0, +) -> tuple[ + Literal["sequential", "parallel", "constant_rate", "burst", "triangular_ramp"], + np.ndarray, + dict[str, float], +]: + if concurrency_type == "sequential": + timings = np.linspace(start_time, end_time, num_requests + 1) + requests = np.column_stack((timings[:-1], timings[1:])) + + return ( + concurrency_type, requests, - first_iter_times, - iter_counts, - first_iter_counts, - include_cdf=True, + { + "start_time": None, + "end_time": None, + "mean_concurrency": 1.0, + "median_concurrency": 1.0, + "std_dev_concurrency": 0.0, + }, ) - ) - assert ( - status_distribution_summary_cdf.total.cumulative_distribution_function - is not None - ) - assert ( - status_distribution_summary_cdf.successful.cumulative_distribution_function - is not None - ) - assert ( - status_distribution_summary_cdf.incomplete.cumulative_distribution_function - is not None - ) - assert ( - status_distribution_summary_cdf.errored.cumulative_distribution_function - is not None - ) - - -def test_running_stats_initialization(): - running_stats = RunningStats() - assert running_stats.start_time == pytest.approx(time.time(), abs=0.01) - assert running_stats.count == 0 - assert running_stats.total == 0 - assert running_stats.last == 0 - assert running_stats.mean == 0 - assert running_stats.rate == 0 - - -def test_running_stats_marshalling(): - running_stats = RunningStats() - serialized = running_stats.model_dump() - deserialized = RunningStats.model_validate(serialized) - for key, value in vars(running_stats).items(): - assert getattr(deserialized, key) == value - - -def test_running_stats_update(): - running_stats = RunningStats() - running_stats.update(1) - assert running_stats.count == 1 - assert running_stats.total == 1 - assert running_stats.last == 1 - assert running_stats.mean == 1 - time.sleep(1.0) - assert running_stats.rate == pytest.approx( - 1.0 / (time.time() - running_stats.start_time), abs=0.1 - ) - - running_stats.update(2) - assert running_stats.count == 2 - assert running_stats.total == 3 - assert running_stats.last == 2 - assert running_stats.mean == 1.5 - time.sleep(1) - assert running_stats.rate == pytest.approx( - 3 / (time.time() - running_stats.start_time), abs=0.1 - ) + if concurrency_type == "parallel": + requests = np.column_stack( + (np.ones(num_requests) * start_time, np.ones(num_requests) * end_time) + ) + return ( + concurrency_type, + requests, + { + "start_time": None, + "end_time": None, + "mean_concurrency": num_requests, + "median_concurrency": num_requests, + "std_dev_concurrency": 0.0, + }, + ) -def test_running_stats_add(): - running_stats = RunningStats() - mean = running_stats + 1 - assert mean == 1 - assert mean == running_stats.mean - assert running_stats.count == 1 - assert running_stats.total == 1 - assert running_stats.last == 1 - - -def test_running_stats_iadd(): - running_stats = 
RunningStats() - running_stats += 1 - assert running_stats.count == 1 - assert running_stats.total == 1 - assert running_stats.last == 1 - assert running_stats.mean == 1 - - -def test_time_running_stats_initialization(): - time_running_stats = TimeRunningStats() - assert time_running_stats.start_time == pytest.approx(time.time(), abs=0.01) - assert time_running_stats.count == 0 - assert time_running_stats.total == 0 - assert time_running_stats.last == 0 - assert time_running_stats.mean == 0 - assert time_running_stats.rate == 0 - assert time_running_stats.total_ms == 0 - assert time_running_stats.last_ms == 0 - assert time_running_stats.mean_ms == 0 - assert time_running_stats.rate_ms == 0 - - -def test_time_running_stats_marshalling(): - time_running_stats = TimeRunningStats() - serialized = time_running_stats.model_dump() - deserialized = TimeRunningStats.model_validate(serialized) - - for key, value in vars(time_running_stats).items(): - assert getattr(deserialized, key) == value - - -def test_time_running_stats_update(): - time_running_stats = TimeRunningStats() - time_running_stats.update(1) - assert time_running_stats.count == 1 - assert time_running_stats.total == 1 - assert time_running_stats.last == 1 - assert time_running_stats.mean == 1 - assert time_running_stats.total_ms == 1000 - assert time_running_stats.last_ms == 1000 - assert time_running_stats.mean_ms == 1000 - time.sleep(1.0) - assert time_running_stats.rate == pytest.approx( - 1.0 / (time.time() - time_running_stats.start_time), abs=0.1 - ) - assert time_running_stats.rate_ms == pytest.approx( - 1000 / (time.time() - time_running_stats.start_time), abs=0.1 - ) + if concurrency_type == "constant_rate": + request_duration = (end_time - start_time) / 10 + timings = np.linspace(start_time, end_time - request_duration, num_requests) + requests = np.column_stack((timings, timings + request_duration)) + request_delay = timings[1] - timings[0] + rate = 1 / request_delay + concurrency = rate * request_duration - time_running_stats.update(2) - assert time_running_stats.count == 2 - assert time_running_stats.total == 3 - assert time_running_stats.last == 2 - assert time_running_stats.mean == 1.5 - assert time_running_stats.total_ms == 3000 - assert time_running_stats.last_ms == 2000 - assert time_running_stats.mean_ms == 1500 - time.sleep(1) - assert time_running_stats.rate == pytest.approx( - 3 / (time.time() - time_running_stats.start_time), abs=0.1 - ) - assert time_running_stats.rate_ms == pytest.approx( - 3000 / (time.time() - time_running_stats.start_time), abs=0.1 - ) + return ( + concurrency_type, + requests, + { + "start_time": request_delay * concurrency, + "end_time": end_time - request_delay * concurrency, + "mean_concurrency": concurrency, + "median_concurrency": concurrency, + "std_dev_concurrency": 0.0, + }, + ) + if concurrency_type == "burst": + request_length = (end_time - start_time) / 10 + requests = np.column_stack( + ( + np.repeat(start_time, num_requests), + np.repeat(start_time + request_length, num_requests), + ) + ) -@pytest.mark.regression -def test_distribution_summary_concurrency_double_counting_regression(): - """Specific regression test for the double-counting bug in concurrency calculation. 
+ fraction_active = request_length / (end_time - start_time) + mean_concurrency_windowed = num_requests * fraction_active + median_concurrency_windowed = 0.0 if fraction_active < 0.5 else num_requests + variance = ( + fraction_active * (num_requests - mean_concurrency_windowed) ** 2 + + (1 - fraction_active) * mean_concurrency_windowed**2 + ) + std_dev_concurrency_windowed = variance**0.5 - Before the fix, when events were merged due to epsilon, the deltas were summed - but then the active count wasn't properly accumulated, causing incorrect results. + return ( + concurrency_type, + requests, + { + "start_time": start_time, + "end_time": end_time, + "mean_concurrency": mean_concurrency_windowed, + "median_concurrency": median_concurrency_windowed, + "std_dev_concurrency": std_dev_concurrency_windowed, + }, + ) - ### WRITTEN BY AI ### - """ - epsilon = 1e-6 - - # Create a scenario where multiple requests start at exactly the same time - # This should result in events being merged, testing the accumulation logic - same_start_time = 1.0 - requests = [ - (same_start_time, 3.0), - (same_start_time, 4.0), - (same_start_time, 5.0), - (same_start_time + epsilon / 3, 6.0), # Very close start (within epsilon) - ] + if concurrency_type == "triangular_ramp": + max_concurrency = num_requests + ramp_up_time = (end_time - start_time) / 2 + request_duration = ramp_up_time + timings = np.linspace(start_time, start_time + ramp_up_time, max_concurrency) + requests = np.column_stack((timings, timings + request_duration)) - distribution_summary = DistributionSummary.from_request_times( - requests, distribution_type="concurrency", epsilon=epsilon - ) + return ( + concurrency_type, + requests, + { + "start_time": None, + "end_time": None, + "mean_concurrency": max_concurrency / 2, + "median_concurrency": max_concurrency / 2, + "std_dev_concurrency": max_concurrency / (2 * math.sqrt(3)), + }, + ) - # All requests start at the same time (or within epsilon), so they should - # all be considered concurrent from the start - # Expected timeline: - # - t=1.0-3.0: 4 concurrent requests - # - t=3.0-4.0: 3 concurrent requests - # - t=4.0-5.0: 2 concurrent requests - # - t=5.0-6.0: 1 concurrent request + return None + + +class TestPercentiles: + @pytest.fixture + def valid_instances( + self, + probability_distributions: tuple[ + str | None, np.ndarray, np.ndarray, dict[str, float] + ], + ) -> tuple[Percentiles, str | None, np.ndarray, np.ndarray, dict[str, float]]: + dist_type, samples, pdf, stats = probability_distributions + instance = Percentiles( + p001=stats["percentiles"]["p001"], + p01=stats["percentiles"]["p01"], + p05=stats["percentiles"]["p05"], + p10=stats["percentiles"]["p10"], + p25=stats["percentiles"]["p25"], + p50=stats["percentiles"]["p50"], + p75=stats["percentiles"]["p75"], + p90=stats["percentiles"]["p90"], + p95=stats["percentiles"]["p95"], + p99=stats["percentiles"]["p99"], + p999=stats["percentiles"]["p999"], + ) + return instance, dist_type, samples, pdf, stats + + @pytest.mark.smoke + def test_class_signatures(self): + assert issubclass(Percentiles, BaseModel) + assert "p001" in Percentiles.model_fields + assert "p01" in Percentiles.model_fields + assert "p05" in Percentiles.model_fields + assert "p10" in Percentiles.model_fields + assert "p25" in Percentiles.model_fields + assert "p50" in Percentiles.model_fields + assert "p75" in Percentiles.model_fields + assert "p90" in Percentiles.model_fields + assert "p95" in Percentiles.model_fields + assert "p99" in Percentiles.model_fields + assert "p999" 
in Percentiles.model_fields
+        assert hasattr(Percentiles, "from_pdf")
+
+    @pytest.mark.smoke
+    def test_initialization(
+        self,
+        valid_instances: tuple[
+            Percentiles, str | None, np.ndarray, np.ndarray, dict[str, float]
+        ],
+    ):
+        instance, _dist_type, _samples, _pdf, stats = valid_instances
+        assert isinstance(instance, Percentiles)
+        assert instance.p001 == stats["percentiles"]["p001"], "p001 percentile mismatch"
+        assert instance.p01 == stats["percentiles"]["p01"], "p01 percentile mismatch"
+        assert instance.p05 == stats["percentiles"]["p05"], "p05 percentile mismatch"
+        assert instance.p10 == stats["percentiles"]["p10"], "p10 percentile mismatch"
+        assert instance.p25 == stats["percentiles"]["p25"], "p25 percentile mismatch"
+        assert instance.p50 == stats["percentiles"]["p50"], "p50 percentile mismatch"
+        assert instance.p75 == stats["percentiles"]["p75"], "p75 percentile mismatch"
+        assert instance.p90 == stats["percentiles"]["p90"], "p90 percentile mismatch"
+        assert instance.p95 == stats["percentiles"]["p95"], "p95 percentile mismatch"
+        assert instance.p999 == stats["percentiles"]["p999"], "p999 percentile mismatch"
+        assert instance.p99 == stats["percentiles"]["p99"], "p99 percentile mismatch"
+
+    @pytest.mark.sanity
+    @pytest.mark.parametrize(
+        "missing_field",
+        ["p001", "p01", "p05", "p10", "p25", "p50", "p75", "p90", "p95", "p99", "p999"],
+    )
+    def test_invalid_initialization(self, missing_field):
+        test_kwargs = {
+            "p001": 0.1,
+            "p01": 1.0,
+            "p05": 5.0,
+            "p10": 10.0,
+            "p25": 25.0,
+            "p50": 50.0,
+            "p75": 75.0,
+            "p90": 90.0,
+            "p95": 95.0,
+            "p99": 99.0,
+            "p999": 99.9,
+        }
+        del test_kwargs[missing_field]
+
+        with pytest.raises(ValidationError):
+            Percentiles(**test_kwargs)
+
+    @pytest.mark.smoke
+    def test_from_pdf(self, valid_instances):
+        _instance, _dist_type, _values, pdf, stats = valid_instances
+
+        tolerance = 0.1 * abs(stats["std_dev"])  # within 10% of standard deviation
+        percentiles = Percentiles.from_pdf(pdf)
+        assert percentiles.p001 == pytest.approx(
+            stats["percentiles"]["p001"], abs=tolerance
+        ), "p001 percentile mismatch"
+        assert percentiles.p01 == pytest.approx(
+            stats["percentiles"]["p01"], abs=tolerance
+        ), "p01 percentile mismatch"
+        assert percentiles.p05 == pytest.approx(
+            stats["percentiles"]["p05"], abs=tolerance
+        ), "p05 percentile mismatch"
+        assert percentiles.p10 == pytest.approx(
+            stats["percentiles"]["p10"], abs=tolerance
+        ), "p10 percentile mismatch"
+        assert percentiles.p25 == pytest.approx(
+            stats["percentiles"]["p25"], abs=tolerance
+        ), "p25 percentile mismatch"
+        assert percentiles.p50 == pytest.approx(
+            stats["percentiles"]["p50"], abs=tolerance
+        ), "p50 percentile mismatch"
+        assert percentiles.p75 == pytest.approx(
+            stats["percentiles"]["p75"], abs=tolerance
+        ), "p75 percentile mismatch"
+        assert percentiles.p90 == pytest.approx(
+            stats["percentiles"]["p90"], abs=tolerance
+        ), "p90 percentile mismatch"
+        assert percentiles.p95 == pytest.approx(
+            stats["percentiles"]["p95"], abs=tolerance
+        ), "p95 percentile mismatch"
+        assert percentiles.p99 == pytest.approx(
+            stats["percentiles"]["p99"], abs=tolerance
+        ), "p99 percentile mismatch"
+        assert percentiles.p999 == pytest.approx(
+            stats["percentiles"]["p999"], abs=(tolerance * 2)
+        ), "p999 percentile mismatch"
+
+    @pytest.mark.sanity
+    @pytest.mark.parametrize(
+        ("pdf", "error_match"),
+        [
+            (np.array([1, 2, 3]), "must be a 2D array"),
+            (np.array([[1, 2, 3]]), "must be a 2D array"),
+            (np.array([[1.0, -0.5], [2.0, 0.5]]), "must be non-negative"),
+
(np.array([[1.0, 0.3], [2.0, 0.5]]), "must sum to 1"), + ], + ) + def test_from_pdf_invalid(self, pdf, error_match): + with pytest.raises(ValueError, match=error_match): + Percentiles.from_pdf(pdf) + + @pytest.mark.smoke + def test_marshalling(self, valid_instances): + instance, _dist_type, _values, _pdf, stats = valid_instances + data_dict = instance.model_dump() + assert isinstance(data_dict, dict) + for param in stats["percentiles"]: + assert param in data_dict + assert data_dict[param] == getattr(instance, param) + + recreated = Percentiles.model_validate(data_dict) + assert isinstance(recreated, Percentiles) + for param in stats["percentiles"]: + assert getattr(recreated, param) == getattr(instance, param) + + +class TestDistributionSummary: + @pytest.fixture + def valid_instances( + self, + probability_distributions: tuple[ + str | None, np.ndarray, np.ndarray, dict[str, float] + ], + ) -> tuple[ + DistributionSummary, str | None, np.ndarray, np.ndarray, dict[str, float] + ]: + dist_type, samples, pdf, stats = probability_distributions + instance = DistributionSummary( + mean=stats["mean"], + median=stats["median"], + mode=0.0, + variance=stats["variance"], + std_dev=stats["std_dev"], + min=stats["min"], + max=stats["max"], + count=stats["count"], + total_sum=stats["total_sum"], + percentiles=Percentiles(**stats["percentiles"]), + pdf=pdf, + ) - assert distribution_summary.max == 4.0 # All 4 requests concurrent at start - assert distribution_summary.min == 1.0 # 1 request still running at the end + return instance, dist_type, samples, pdf, stats + + @pytest.mark.smoke + def test_class_signatures(self): + assert issubclass(DistributionSummary, BaseModel) + assert "mean" in DistributionSummary.model_fields + assert "median" in DistributionSummary.model_fields + assert "mode" in DistributionSummary.model_fields + assert "variance" in DistributionSummary.model_fields + assert "std_dev" in DistributionSummary.model_fields + assert "min" in DistributionSummary.model_fields + assert "max" in DistributionSummary.model_fields + assert "count" in DistributionSummary.model_fields + assert "total_sum" in DistributionSummary.model_fields + assert "percentiles" in DistributionSummary.model_fields + assert "pdf" in DistributionSummary.model_fields + assert hasattr(DistributionSummary, "from_pdf") + assert hasattr(DistributionSummary, "from_values") + assert hasattr(DistributionSummary, "rate_distribution_from_timings") + assert hasattr(DistributionSummary, "concurrency_distribution_from_timings") + + @pytest.mark.smoke + def test_initialization( + self, + valid_instances: tuple[ + DistributionSummary, str | None, np.ndarray, np.ndarray, dict[str, float] + ], + ): + instance, _dist_type, _samples, _pdf, stats = valid_instances + assert instance.mean == stats["mean"] + assert instance.median == stats["median"] + assert instance.variance == stats["variance"] + assert instance.std_dev == stats["std_dev"] + assert instance.min == stats["min"] + assert instance.max == stats["max"] + assert instance.count == stats["count"] + assert instance.total_sum == stats["total_sum"] + assert isinstance(instance.percentiles, Percentiles) + for param in stats["percentiles"]: + assert getattr(instance.percentiles, param) == stats["percentiles"][param] + assert instance.pdf is None or isinstance(instance.pdf, list) + + @pytest.mark.sanity + @pytest.mark.parametrize( + "missing_field", + [ + "mean", + "median", + "mode", + "variance", + "std_dev", + "min", + "max", + "count", + "total_sum", + "percentiles", + ], + ) + 
def test_invalid_initialization(self, missing_field): + test_kwargs = { + "mean": 50.0, + "median": 50.0, + "mode": 50.0, + "variance": 835.0, + "std_dev": math.sqrt(835.0), + "min": 0.0, + "max": 100.0, + "count": 1001, + "total_sum": 50050.0, + "percentiles": Percentiles( + p001=0.1, + p01=1.0, + p05=5.0, + p10=10.0, + p25=25.0, + p50=50.0, + p75=75.0, + p90=90.0, + p95=95.0, + p99=99.0, + p999=99.9, + ), + } + del test_kwargs[missing_field] + + with pytest.raises(ValidationError): + DistributionSummary(**test_kwargs) + + @pytest.mark.smoke + @pytest.mark.parametrize("include_pdf", [False, True]) + def test_from_pdf( + self, + valid_instances: tuple[ + DistributionSummary, str | None, np.ndarray, np.ndarray, dict[str, float] + ], + include_pdf: bool | int, + ): + _instance, _dist_type, values, pdf, stats = valid_instances + + tolerance = 0.1 * abs(stats["std_dev"]) # within 10% of standard deviation + summary = DistributionSummary.from_pdf(pdf, include_pdf=include_pdf) + assert summary.mean == pytest.approx(stats["mean"], abs=tolerance), ( + "mean mismatch" + ) + assert summary.median == pytest.approx(stats["median"], abs=tolerance), ( + "median mismatch" + ) + assert summary.variance == pytest.approx(stats["variance"], abs=tolerance), ( + "variance mismatch" + ) + assert summary.std_dev == pytest.approx(stats["std_dev"], abs=tolerance), ( + "std_dev mismatch" + ) + assert summary.min == pytest.approx(stats["min"], abs=tolerance), "min mismatch" + assert summary.max == pytest.approx(stats["max"], abs=tolerance), "max mismatch" + assert summary.count == stats["count"], "count mismatch" + assert summary.total_sum == pytest.approx(stats["total_sum"], abs=tolerance), ( + "total_sum mismatch" + ) + assert isinstance(summary.percentiles, Percentiles) + for param in stats["percentiles"]: + assert getattr(summary.percentiles, param) == pytest.approx( + stats["percentiles"][param], + abs=tolerance if param != "p999" else (tolerance * 2), + ), f"{param} percentile mismatch" + + if include_pdf is False: + assert summary.pdf is None + elif include_pdf is True: + assert summary.pdf is not None + assert isinstance(summary.pdf, list) + assert len(summary.pdf) == len(pdf) + + @pytest.mark.smoke + @pytest.mark.parametrize("include_pdf", [False, True]) + def test_from_values( + self, + valid_instances: tuple[ + DistributionSummary, str | None, np.ndarray, np.ndarray, dict[str, float] + ], + include_pdf: bool | int, + ): + _instance, _dist_type, values, _pdf, stats = valid_instances + + tolerance = 0.1 * abs(stats["std_dev"]) # within 10% of standard deviation + summary = DistributionSummary.from_values(values, include_pdf=include_pdf) + assert summary.mean == pytest.approx(stats["mean"], abs=tolerance), ( + "mean mismatch" + ) + assert summary.median == pytest.approx(stats["median"], abs=tolerance), ( + "median mismatch" + ) + assert summary.variance == pytest.approx(stats["variance"], abs=tolerance), ( + "variance mismatch" + ) + assert summary.std_dev == pytest.approx(stats["std_dev"], abs=tolerance), ( + "std_dev mismatch" + ) + assert summary.min == pytest.approx(stats["min"], abs=tolerance), "min mismatch" + assert summary.max == pytest.approx(stats["max"], abs=tolerance), "max mismatch" + assert summary.count == stats["count"], "count mismatch" + assert summary.total_sum == pytest.approx(stats["total_sum"], abs=tolerance), ( + "total_sum mismatch" + ) + assert isinstance(summary.percentiles, Percentiles) + for param in stats["percentiles"]: + assert getattr(summary.percentiles, param) == 
pytest.approx(
+                stats["percentiles"][param],
+                abs=tolerance if param != "p999" else (tolerance * 2),
+            ), f"{param} percentile mismatch"
+
+        if include_pdf is False:
+            assert summary.pdf is None
+        elif include_pdf is True:
+            assert summary.pdf is not None
+            assert isinstance(summary.pdf, list)
+            assert len(summary.pdf) > 0 if len(values) > 0 else len(summary.pdf) == 0
+
+    @pytest.mark.smoke
+    @pytest.mark.parametrize(
+        ("limit_start_time", "limit_end_time", "include_pdf"),
+        [
+            (False, False, False),
+            (True, False, True),
+            (False, True, False),
+            (True, True, True),
+        ],
+    )
+    def test_rate_distribution_from_timings(
+        self,
+        valid_instances: tuple[
+            DistributionSummary, str | None, np.ndarray, np.ndarray, dict[str, float]
+        ],
+        limit_start_time: bool,
+        limit_end_time: bool,
+        include_pdf: bool | int,
+    ):
+        _instance, dist_type, _values, pdf, stats = valid_instances
+
+        if dist_type in ("exponential", "poisson"):
+            pytest.skip(
+                f"Skipping rate distribution test for {dist_type} distribution "
+                "due to inherent variability and incompatibility with rate assumptions."
+            )
+
+        rng = np.random.default_rng(seed=42)
+
+        if len(pdf) > 0:
+            # The PDF gives the expected distribution for the rates
+            # So, we can use it to sample individual, instantaneous rates
+            # and convert those to timings by inverting and accumulating
+            sampled_rates = rng.choice(pdf[:, 0], size=100000, p=pdf[:, 1])
+            delta_times = 1.0 / np.clip(sampled_rates, a_min=1e-6, a_max=None)
+            timings = np.cumsum(delta_times)
+        else:
+            timings = np.array([])
+
+        # Now, compute the rate distribution from the timings and compare
+        start_time = stats["mean"] if limit_start_time and len(timings) > 0 else None
+        end_time = (
+            np.max(timings) - stats["mean"]
+            if limit_end_time and len(timings) > 0
+            else None
+        )
+        distribution = DistributionSummary.rate_distribution_from_timings(
+            timings, start_time=start_time, end_time=end_time, include_pdf=include_pdf
+        )
+        # Check expected nearly exact values (mean and count)
+        expected_rate = (
+            len(timings) / (timings[-1] - timings[0]) if len(timings) > 1 else 0.0
+        )
+        assert distribution.mean == pytest.approx(expected_rate, rel=1e-3), (
+            "expected mean rate mismatch"
+        )
+        expected_count = len(timings)
+        if start_time and len(timings) > 0:
+            expected_count -= len(timings[timings < start_time])
+        if end_time and len(timings) > 0:
+            expected_count -= len(timings[timings > end_time])
+        assert distribution.count == expected_count, "expected count mismatch"
+
+        # Loosely validate against original stats (randomness in sampling)
+        tolerance = 0.5 * abs(stats["std_dev"])  # within 50% of standard deviation
+        assert distribution.mean == pytest.approx(stats["mean"], abs=tolerance), (
+            "mean mismatch"
+        )
+        assert distribution.median == pytest.approx(stats["median"], abs=tolerance), (
+            "median mismatch"
+        )
+        assert distribution.std_dev == pytest.approx(stats["std_dev"], abs=tolerance), (
+            "std_dev mismatch"
+        )
-
-@pytest.mark.sanity
-def test_distribution_summary_concurrency_epsilon_edge_case():
-    """Test the exact epsilon boundary condition.
+ @pytest.mark.smoke + @pytest.mark.parametrize( + ("concurrency_type", "include_pdf"), + [ + ("sequential", False), + ("parallel", True), + ("constant_rate", False), + ("burst", True), + ("triangular_ramp", False), + ], + ) + def test_concurrency_distribution_from_timings(self, concurrency_type, include_pdf): + ( + _concurrency_type, + requests, + stats, + ) = concurrency_distributions(concurrency_type, num_requests=1000) - ### WRITTEN BY AI ### - """ - epsilon = 1e-6 + distribution = DistributionSummary.concurrency_distribution_from_timings( + requests, + start_time=stats["start_time"], + end_time=stats["end_time"], + include_pdf=include_pdf, + ) - # Test requests that are exactly epsilon apart - should be merged - requests_exactly_epsilon = [ - (1.0, 2.0), - (1.0 + epsilon, 2.5), # Exactly epsilon apart - (2.0, 2.5), # Another close request - ] + assert distribution.mean == pytest.approx( + stats["mean_concurrency"], rel=1e-2 + ), "mean concurrency mismatch" + assert distribution.median == pytest.approx( + stats["median_concurrency"], rel=1e-2 + ), "median concurrency mismatch" + assert distribution.std_dev == pytest.approx( + stats["std_dev_concurrency"], rel=1e-2 + ), "std_dev concurrency mismatch" + + @pytest.mark.smoke + def test_marshalling(self, valid_instances): + instance, _dist_type, _values, _pdf, stats = valid_instances + data_dict = instance.model_dump() + assert isinstance(data_dict, dict) + for param in [ + "mean", + "median", + "mode", + "variance", + "std_dev", + "min", + "max", + "count", + "total_sum", + "percentiles", + "pdf", + ]: + assert param in data_dict + if param == "percentiles": + for p_param in stats["percentiles"]: + assert ( + getattr(instance.percentiles, p_param) + == data_dict["percentiles"][p_param] + ) + else: + assert data_dict[param] == getattr(instance, param) + + recreated = DistributionSummary.model_validate(data_dict) + assert isinstance(recreated, DistributionSummary) + for param in [ + "mean", + "median", + "mode", + "variance", + "std_dev", + "min", + "max", + "count", + "total_sum", + "percentiles", + "pdf", + ]: + if param == "percentiles": + for p_param in stats["percentiles"]: + assert getattr(recreated.percentiles, p_param) == getattr( + instance.percentiles, p_param + ) + else: + assert getattr(recreated, param) == getattr(instance, param) + + @pytest.mark.sanity + @pytest.mark.parametrize( + ("values", "error_type"), + [ + ("not_a_list", ValueError), + ({"invalid": "dict"}, ValueError), + (None, ValueError), + ], + ) + def test_from_values_invalid_input(self, values, error_type): + """Test DistributionSummary.from_values with invalid input types.""" + with pytest.raises(error_type): + DistributionSummary.from_values(values) + + @pytest.mark.sanity + @pytest.mark.parametrize( + ("pdf", "error_match"), + [ + (np.array([1, 2, 3]), "must be a 2D array"), + (np.array([[1, 2, 3]]), "must be a 2D array"), + (np.array([[1.0, -0.5], [2.0, 0.5]]), "must be non-negative"), + (np.array([[1.0, 0.3], [2.0, 0.5]]), "must sum to 1"), + ], + ) + def test_from_pdf_invalid(self, pdf, error_match): + """Test DistributionSummary.from_pdf with invalid PDFs.""" + with pytest.raises(ValueError, match=error_match): + DistributionSummary.from_pdf(pdf) + + @pytest.mark.sanity + def test_from_values_with_weights(self): + """Test DistributionSummary.from_values with weighted values.""" + # Values with weights: (value, weight) + values = [(1.0, 2.0), (2.0, 1.0), (3.0, 1.0)] + summary = DistributionSummary.from_values(values) + + assert isinstance(summary, 
DistributionSummary) + # Count is sum of weights: 2 + 1 + 1 = 4 + assert summary.count == 4 + # Mean should be weighted: (1*2 + 2*1 + 3*1) / (2+1+1) = 7/4 = 1.75 + assert summary.mean == pytest.approx(1.75, abs=0.01) + + @pytest.mark.sanity + def test_rate_distribution_empty_timings(self): + """Test rate_distribution_from_timings with empty input.""" + summary = DistributionSummary.rate_distribution_from_timings([]) + assert summary.count == 0 + assert summary.mean == 0.0 + + @pytest.mark.sanity + def test_concurrency_distribution_empty_intervals(self): + """Test concurrency_distribution_from_timings with empty input.""" + summary = DistributionSummary.concurrency_distribution_from_timings([]) + assert summary.count == 0 + assert summary.mean == 0.0 + + @pytest.mark.sanity + def test_rate_distribution_single_event(self): + """Test rate_distribution_from_timings with single event.""" + summary = DistributionSummary.rate_distribution_from_timings([1.0]) + # Single event results in no rates (need at least 2 for intervals) + assert summary.count == 0 + assert summary.mean == 0.0 + + @pytest.mark.sanity + def test_concurrency_with_weighted_intervals(self): + """Test concurrency_distribution_from_timings with weighted intervals.""" + # Intervals with weights: (start, end, weight) + intervals = [(0.0, 10.0, 2.0), (5.0, 15.0, 1.0)] + summary = DistributionSummary.concurrency_distribution_from_timings(intervals) + + assert isinstance(summary, DistributionSummary) + assert summary.count == 2 + + +class TestStatusDistributionSummary: + @pytest.fixture( + params=[ + { + "successful": [1.0, 2.0, 3.0], + "incomplete": [4.0, 5.0], + "errored": [6.0], + }, + { + "successful": np.array([10.0, 20.0, 30.0, 40.0]), + "incomplete": np.array([50.0]), + "errored": np.array([]), + }, + { + "successful": [], + "incomplete": [], + "errored": [], + }, + ] + ) + def valid_instances( + self, + request, + ) -> tuple[StatusDistributionSummary, dict[str, list[float] | np.ndarray]]: + """Fixture providing test data for StatusDistributionSummary.""" + test_data = request.param + instance = StatusDistributionSummary.from_values( + successful=test_data["successful"], + incomplete=test_data["incomplete"], + errored=test_data["errored"], + ) + return instance, test_data + + @pytest.mark.smoke + def test_class_signatures(self): + """Test StatusDistributionSummary class structure and methods.""" + assert hasattr(StatusDistributionSummary, "from_values") + assert hasattr(StatusDistributionSummary, "rate_distribution_from_timings") + assert hasattr( + StatusDistributionSummary, "concurrency_distribution_from_timings" + ) + assert "total" in StatusDistributionSummary.model_fields + assert "successful" in StatusDistributionSummary.model_fields + assert "incomplete" in StatusDistributionSummary.model_fields + assert "errored" in StatusDistributionSummary.model_fields + + @pytest.mark.smoke + def test_initialization( + self, + valid_instances: tuple[ + StatusDistributionSummary, dict[str, list[float] | np.ndarray] + ], + ): + """Test StatusDistributionSummary initialization.""" + instance, test_data = valid_instances + assert isinstance(instance, StatusDistributionSummary) + assert isinstance(instance.total, DistributionSummary) + assert isinstance(instance.successful, DistributionSummary) + assert isinstance(instance.incomplete, DistributionSummary) + assert isinstance(instance.errored, DistributionSummary) + + # Verify counts match expected + successful_count = ( + len(test_data["successful"]) + if 
isinstance(test_data["successful"], list) + else test_data["successful"].shape[0] + ) + incomplete_count = ( + len(test_data["incomplete"]) + if isinstance(test_data["incomplete"], list) + else test_data["incomplete"].shape[0] + ) + errored_count = ( + len(test_data["errored"]) + if isinstance(test_data["errored"], list) + else test_data["errored"].shape[0] + ) - dist_epsilon = DistributionSummary.from_request_times( - requests_exactly_epsilon, distribution_type="concurrency", epsilon=epsilon - ) + assert instance.successful.count == successful_count + assert instance.incomplete.count == incomplete_count + assert instance.errored.count == errored_count + assert ( + instance.total.count == successful_count + incomplete_count + errored_count + ) - # Should be treated as concurrent (merged events) - assert dist_epsilon.max == 2.0 - assert dist_epsilon.min == 2.0 + @pytest.mark.sanity + @pytest.mark.parametrize( + ("field", "value"), + [ + ("successful", "invalid_string"), + ("incomplete", 123), + ("errored", [1, 2, 3]), + ("total", {"dict": "value"}), + ], + ) + def test_invalid_initialization(self, field, value): + """Test StatusDistributionSummary with invalid field types.""" + test_kwargs = { + "successful": DistributionSummary.from_values([1.0, 2.0]), + "incomplete": DistributionSummary.from_values([3.0]), + "errored": DistributionSummary.from_values([]), + "total": DistributionSummary.from_values([1.0, 2.0, 3.0]), + } + test_kwargs[field] = value + + with pytest.raises(ValidationError): + StatusDistributionSummary(**test_kwargs) + + @pytest.mark.smoke + @pytest.mark.parametrize("include_pdf", [False, True]) + def test_from_values( + self, + valid_instances: tuple[ + StatusDistributionSummary, dict[str, list[float] | np.ndarray] + ], + include_pdf: bool | int, + ): + """Test creating StatusDistributionSummary from values.""" + _instance, test_data = valid_instances + + summary = StatusDistributionSummary.from_values( + successful=test_data["successful"], + incomplete=test_data["incomplete"], + errored=test_data["errored"], + include_pdf=include_pdf, + ) - # Test requests that are just over epsilon apart - should NOT be merged - requests_over_epsilon = [ - (1.0, 2.0), - (1.0 + epsilon * 1.1, 2.5), # Just over epsilon apart - (2.0, 2.5), # Another close request - ] + assert isinstance(summary, StatusDistributionSummary) + assert isinstance(summary.total, DistributionSummary) + assert isinstance(summary.successful, DistributionSummary) + assert isinstance(summary.incomplete, DistributionSummary) + assert isinstance(summary.errored, DistributionSummary) + + if include_pdf is False: + assert summary.total.pdf is None + assert summary.successful.pdf is None + assert summary.incomplete.pdf is None + assert summary.errored.pdf is None + elif include_pdf is True: + assert summary.total.pdf is not None or summary.total.count == 0 + assert summary.successful.pdf is not None or summary.successful.count == 0 + assert summary.incomplete.pdf is not None or summary.incomplete.count == 0 + assert summary.errored.pdf is not None or summary.errored.count == 0 + + @pytest.mark.smoke + @pytest.mark.parametrize( + ("limit_start_time", "limit_end_time", "include_pdf"), + [ + (False, False, False), + (True, False, True), + (False, True, False), + (True, True, True), + ], + ) + def test_rate_distribution_from_timings( + self, + limit_start_time: bool, + limit_end_time: bool, + include_pdf: bool | int, + ): + """Test creating rate distribution from timings by status.""" + rng = np.random.default_rng(seed=42) + 
successful_times = rng.uniform(0, 100, 50).tolist() + incomplete_times = rng.uniform(0, 100, 20).tolist() + errored_times = rng.uniform(0, 100, 10).tolist() + + start_time = 25.0 if limit_start_time else None + end_time = 75.0 if limit_end_time else None + + summary = StatusDistributionSummary.rate_distribution_from_timings( + successful=successful_times, + incomplete=incomplete_times, + errored=errored_times, + start_time=start_time, + end_time=end_time, + include_pdf=include_pdf, + ) - dist_over_epsilon = DistributionSummary.from_request_times( - requests_over_epsilon, distribution_type="concurrency", epsilon=epsilon - ) + assert isinstance(summary, StatusDistributionSummary) + assert isinstance(summary.total, DistributionSummary) + assert isinstance(summary.successful, DistributionSummary) + assert isinstance(summary.incomplete, DistributionSummary) + assert isinstance(summary.errored, DistributionSummary) + + # Verify counts are reasonable + assert summary.total.count >= 0 + assert summary.successful.count >= 0 + assert summary.incomplete.count >= 0 + assert summary.errored.count >= 0 + + @pytest.mark.smoke + @pytest.mark.parametrize( + "include_pdf", + [ + False, + True, + ], + ) + def test_concurrency_distribution_from_timings(self, include_pdf: bool | int): + """Test creating concurrency distribution from intervals by status.""" + rng = np.random.default_rng(seed=42) + num_successful = 30 + num_incomplete = 10 + num_errored = 5 + + # Generate realistic intervals (start, end) + successful_starts = rng.uniform(0, 80, num_successful) + successful_intervals = [ + (start, start + rng.uniform(1, 20)) for start in successful_starts + ] + + incomplete_starts = rng.uniform(0, 80, num_incomplete) + incomplete_intervals = [ + (start, start + rng.uniform(1, 20)) for start in incomplete_starts + ] + + errored_starts = rng.uniform(0, 80, num_errored) + errored_intervals = [ + (start, start + rng.uniform(1, 20)) for start in errored_starts + ] + + summary = StatusDistributionSummary.concurrency_distribution_from_timings( + successful=successful_intervals, + incomplete=incomplete_intervals, + errored=errored_intervals, + include_pdf=include_pdf, + ) - # These should be treated separately, so max concurrency depends on overlap - # At t=1.0 to 1.0+epsilon*1.1: 1 concurrent - # At t=1.0+epsilon*1.1 to 2.0: 2 concurrent - # At t=2.0 to 2.5: 1 concurrent - assert dist_over_epsilon.max == 2.0 - assert dist_over_epsilon.min == 1.0 + assert isinstance(summary, StatusDistributionSummary) + assert isinstance(summary.total, DistributionSummary) + assert isinstance(summary.successful, DistributionSummary) + assert isinstance(summary.incomplete, DistributionSummary) + assert isinstance(summary.errored, DistributionSummary) + + # Verify counts match + assert summary.successful.count == num_successful + assert summary.incomplete.count == num_incomplete + assert summary.errored.count == num_errored + assert summary.total.count == num_successful + num_incomplete + num_errored + + @pytest.mark.smoke + def test_marshalling(self, valid_instances): + """Test StatusDistributionSummary serialization and deserialization.""" + instance, _test_data = valid_instances + data_dict = instance.model_dump() + assert isinstance(data_dict, dict) + assert "total" in data_dict + assert "successful" in data_dict + assert "incomplete" in data_dict + assert "errored" in data_dict + + # Verify each status has distribution summary data + for status in ["total", "successful", "incomplete", "errored"]: + assert isinstance(data_dict[status], 
dict)
+            assert "mean" in data_dict[status]
+            assert "median" in data_dict[status]
+            assert "count" in data_dict[status]
+
+        recreated = StatusDistributionSummary.model_validate(data_dict)
+        assert isinstance(recreated, StatusDistributionSummary)
+        assert recreated.total.count == instance.total.count
+        assert recreated.successful.count == instance.successful.count
+        assert recreated.incomplete.count == instance.incomplete.count
+        assert recreated.errored.count == instance.errored.count