30 changes: 30 additions & 0 deletions docs/getting-started/benchmark.md
@@ -82,6 +82,14 @@ For example, setting `--max-requests 1000` with `--profile sweep` will run 1000

GuideLLM supports several benchmark profiles and strategies, which are described in detail below.

- `synchronous`: Runs requests one at a time (sequential)
- `throughput`: Tests maximum throughput by running requests in parallel
- `concurrent`: Runs a fixed number of parallel request streams
- `constant`: Sends requests at a fixed rate per second
- `poisson`: Sends requests following a Poisson distribution
- `sweep`: Automatically determines optimal performance points (default)
- `replay`: Replays requests from a trace file to reproduce real-world traffic patterns (beta)

#### Synchronous Profile

Runs requests one at a time (sequential).
@@ -187,6 +195,28 @@ guidellm benchmark \

You can customize synthetic data generation with additional parameters such as standard deviation, minimum, and maximum values. See the [Datasets Synthetic data documentation](../guides/datasets.md#synthetic-data) for more details.

### Trace Replay Benchmarking (beta)

For realistic load testing, replay traffic patterns from trace files. Trace files must be JSONL, with one event per line containing `timestamp` (in seconds), `input_length`, and `output_length` fields:

```json
{"timestamp": 0, "input_length": 256, "output_length": 128}
{"timestamp": 0.5, "input_length": 512, "output_length": 64}
```
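Before running a benchmark, it can help to sanity-check that a trace file parses and carries the required fields. A minimal, hypothetical validator (not part of the GuideLLM API) might look like this:

```python
import json

# Fields the replay profile expects on every trace event
REQUIRED_FIELDS = {"timestamp", "input_length", "output_length"}


def validate_trace_lines(lines):
    """Parse JSONL trace lines and check each event has the required fields."""
    events = []
    for lineno, line in enumerate(lines, start=1):
        if not line.strip():
            continue  # skip blank lines
        event = json.loads(line)
        missing = REQUIRED_FIELDS - event.keys()
        if missing:
            raise ValueError(f"line {lineno}: missing fields {sorted(missing)}")
        events.append(event)
    return events


events = validate_trace_lines([
    '{"timestamp": 0, "input_length": 256, "output_length": 128}',
    '{"timestamp": 0.5, "input_length": 512, "output_length": 64}',
])
print(len(events))  # 2
```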

Run with the `replay` profile:

```bash
guidellm benchmark \
--target "http://localhost:8000" \
--data "path/to/trace.jsonl" \
--data-args '{"type_": "trace_synthetic"}' \
--profile replay \
--rate 1.0
```

The `rate` parameter acts as a time-scale factor: `1.0` replays at the original speed, `2.0` at twice the speed, and `0.5` at half speed.
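As a mental model, the scaling can be sketched as a pure function over the trace's relative timestamps. The helper below is illustrative only, not part of the GuideLLM API, and assumes the documented semantics where a rate above `1.0` replays faster:

```python
def scaled_offsets(relative_timestamps: list[float], rate: float) -> list[float]:
    """Map trace timestamps to wall-clock send offsets under a time scale.

    Hypothetical helper: assumes rate > 1.0 compresses the trace (faster
    replay) and rate < 1.0 stretches it (slower replay).
    """
    if rate <= 0:
        raise ValueError("rate must be positive")
    return [ts / rate for ts in relative_timestamps]


# Events at 0s, 0.5s, and 1s replayed at 2x speed are sent at 0s, 0.25s, 0.5s.
print(scaled_offsets([0.0, 0.5, 1.0], rate=2.0))
```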

### Working with Real Data

While synthetic data is convenient for quick tests, you can benchmark with real-world data:
5 changes: 5 additions & 0 deletions docs/guides/datasets.md
Expand Up @@ -131,6 +131,11 @@ GuideLLM supports various file formats for datasets, including text, CSV, JSON,
{"prompt": "Hello, how are you?", "output_tokens_count": 5, "additional_column": "foo", "additional_column2": "bar"}
{"prompt": "What is your name?", "output_tokens_count": 3, "additional_column": "baz", "additional_column2": "qux"}
```
- **Trace files (`.jsonl` with `trace_synthetic` type)**: Specialized JSONL files for replay benchmarking with `timestamp`, `input_length`, and `output_length` fields. Used with `--profile replay` to reproduce production traffic patterns. See [Trace Replay Benchmarking](../getting-started/benchmark.md#trace-replay-benchmarking-beta).
```json
{"timestamp": 0, "input_length": 256, "output_length": 128}
{"timestamp": 0.5, "input_length": 512, "output_length": 64}
```
- **JSON files (`.json`)**: Where the entire dataset is represented as a JSON array of objects nested under a specific key. To point the loader at that key, pass a `--data-column-mapper` argument containing `"field": "NAME"`, where `NAME` is the key under which the array lives. The objects should include `prompt` or another common prompt alias, which will be used as the prompt column. Additional fields can be included based on the previously mentioned aliases for the `--data-column-mapper` argument.
```json
{
103 changes: 70 additions & 33 deletions src/guidellm/benchmark/entrypoints.py
Expand Up @@ -255,6 +255,7 @@ async def resolve_request_loader(
data_num_workers: int | None,
random_seed: int,
console: Console | None = None,
max_requests: int | None = None,
**dataloader_kwargs: dict[str, Any] | None,
) -> DataLoader[GenerationRequest]:
"""
@@ -278,6 +279,7 @@
:param data_num_workers: Number of worker processes for data loading
:param random_seed: Seed for reproducible random operations
:param console: Console instance for progress reporting, or None
:param max_requests: If set, the first data source loads at most this many rows.
:param dataloader_kwargs: Additional arguments passed to DataLoader initialization
:return: Configured DataLoader instance for GenerationRequest objects
:raises ValueError: If request formatter type is not registered in
@@ -314,6 +316,17 @@
data_finalizer,
)

# When max_requests is set, limit the first data source to that many rows at load
if max_requests is not None and data:
if max_requests < 1:
raise ValueError(
"max_requests must be >= 1 when set for data truncation, "
f"got {max_requests}"
)
data_args = list(data_args) if data_args else [{} for _ in data]
if len(data_args) >= 1:
data_args[0] = {**data_args[0], "max_rows": max_requests}

request_loader: DataLoader[GenerationRequest] = DataLoader(
data=data,
data_args=data_args,
@@ -360,6 +373,7 @@ async def resolve_profile(
max_global_error_rate: float | None,
over_saturation: dict[str, Any] | None = None,
console: Console | None = None,
data: list[Any] | None = None,
) -> Profile:
"""
Resolve and configure a benchmark profile with rate and constraint settings.
@@ -381,6 +395,7 @@
:param max_global_error_rate: Maximum global error rate threshold before stopping
:param over_saturation: Over-saturation detection configuration (dict)
:param console: Console instance for progress reporting, or None
:param data: Optional list of data sources.
:return: Configured Profile instance ready for benchmarking
:raises ValueError: If constraints are provided with a pre-configured Profile
"""
@@ -408,6 +423,7 @@
random_seed=random_seed,
rampup_duration=rampup,
constraints={**constraints},
data=data,
)
elif constraints:
raise ValueError(
@@ -496,24 +512,60 @@ async def benchmark_generative_text(
processor = await resolve_processor(
processor=args.processor, model=model, console=console
)
request_loader = await resolve_request_loader(
data=args.data,
model=model,
data_args=args.data_args,
data_samples=args.data_samples,
processor=processor,
processor_args=args.processor_args,
data_column_mapper=args.data_column_mapper,
data_preprocessors=args.data_preprocessors,
data_preprocessors_kwargs=args.data_preprocessors_kwargs,
data_finalizer=args.data_finalizer,
data_collator=args.data_collator,
data_sampler=args.data_sampler,
data_num_workers=args.data_num_workers,
random_seed=args.random_seed,
console=console,
**(args.dataloader_kwargs or {}),
)

# Build common kwargs for resolve_profile and resolve_request_loader
profile_kwargs = {
"profile": args.profile,
"rate": args.rate,
"random_seed": args.random_seed,
"rampup": args.rampup,
"constraints": constraints,
"max_seconds": args.max_seconds,
"max_requests": args.max_requests,
"max_errors": args.max_errors,
"max_error_rate": args.max_error_rate,
"max_global_error_rate": args.max_global_error_rate,
"over_saturation": args.over_saturation,
"console": console,
}
loader_kwargs = {
"data": args.data,
"model": model,
"data_args": args.data_args,
"data_samples": args.data_samples,
"processor": processor,
"processor_args": args.processor_args,
"data_column_mapper": args.data_column_mapper,
"data_preprocessors": args.data_preprocessors,
"data_preprocessors_kwargs": args.data_preprocessors_kwargs,
"data_finalizer": args.data_finalizer,
"data_collator": args.data_collator,
"data_sampler": args.data_sampler,
"data_num_workers": args.data_num_workers,
"random_seed": args.random_seed,
"console": console,
}

# For replay profile: resolve profile first to apply max_seconds filtering,
# then use the filtered count for the data loader. This ensures the data
# loader and scheduler both work with the same filtered request count.
if args.profile == "replay":
profile = await resolve_profile(**profile_kwargs, data=args.data) # type: ignore[arg-type]
effective_max_requests = (
profile.constraints.get("max_requests")
if profile.constraints
else args.max_requests
)
request_loader = await resolve_request_loader(
**loader_kwargs, # type: ignore[arg-type,misc]
max_requests=effective_max_requests, # type: ignore[arg-type]
)
else:
request_loader = await resolve_request_loader(
**loader_kwargs, # type: ignore[arg-type,misc]
max_requests=args.max_requests, # type: ignore[arg-type]
)
profile = await resolve_profile(**profile_kwargs, data=None) # type: ignore[arg-type]

warmup = TransientPhaseConfig.create_from_value(args.warmup)
cooldown = TransientPhaseConfig.create_from_value(args.cooldown)
@@ -529,21 +581,6 @@
),
status="success",
)

profile = await resolve_profile(
profile=args.profile,
rate=args.rate,
random_seed=args.random_seed,
rampup=args.rampup,
constraints=constraints,
max_seconds=args.max_seconds,
max_requests=args.max_requests,
max_errors=args.max_errors,
max_error_rate=args.max_error_rate,
max_global_error_rate=args.max_global_error_rate,
over_saturation=args.over_saturation,
console=console,
)
output_formats = await resolve_output_formats(
outputs=args.outputs, output_dir=args.output_dir, console=console
)
122 changes: 121 additions & 1 deletion src/guidellm/benchmark/profiles.py
@@ -13,6 +13,7 @@

from abc import ABC, abstractmethod
from collections.abc import Generator
from pathlib import Path
from typing import TYPE_CHECKING, Annotated, Any, ClassVar, Literal

import numpy as np
@@ -37,6 +38,8 @@
SchedulingStrategy,
SynchronousStrategy,
ThroughputStrategy,
TraceReplayStrategy,
load_relative_timestamps,
)
from guidellm.schemas import PydanticClassRegistryMixin

@@ -48,13 +51,14 @@
"ConcurrentProfile",
"Profile",
"ProfileType",
"ReplayProfile",
"SweepProfile",
"SynchronousProfile",
"ThroughputProfile",
]

ProfileType = Annotated[
Literal["synchronous", "concurrent", "throughput", "async", "sweep"],
Literal["synchronous", "concurrent", "throughput", "async", "sweep", "replay"],
"Profile type identifiers for polymorphic deserialization",
]

@@ -328,6 +332,122 @@ def next_strategy(
return SynchronousStrategy()


@Profile.register("replay")
class ReplayProfile(Profile):
"""
Replay a trace file:
schedule each request at start_time + time_scale * relative_timestamp[i].

For this profile, the ``rate`` argument is interpreted as time_scale (scale factor
applied to relative timestamps), not as requests per second.

When ``constraints["max_requests"]`` is set, the trace is truncated at load time:
only the first max_requests rows are loaded from the file for both timestamps (here)
and request data (in the data loader). This keeps timestamps and requests aligned.
The trace file is read twice: once by the data pipeline for request payloads, and
once here for relative timestamps.
"""

type_: Literal["replay"] = "replay" # type: ignore[assignment]
relative_timestamps: list[float] = Field(
description="Request start times relative to first event (first = 0)",
)
time_scale: float = Field(
default=1.0,
gt=0,
description="Scale factor applied to relative timestamps",
)
max_seconds_filter: float | None = Field(
default=None,
description=(
"Original max_seconds value used as a load-time filter "
"(not a runtime constraint)"
),
)

@classmethod
def resolve_args(
cls,
rate_type: str,
rate: list[float] | None,
random_seed: int,
**kwargs: Any,
) -> dict[str, Any]:
_ = (rate_type, random_seed) # unused
data = kwargs.get("data")
if not data or not data[0]:
raise ValueError("Replay profile requires data (path to trace file)")
path = Path(data[0]) if isinstance(data[0], str) else data[0]
if not path.exists():
raise ValueError(f"Replay trace file not found: {path}")
constraints = kwargs.get("constraints") or {}
max_requests = constraints.get("max_requests")
if max_requests is not None and max_requests < 1:
raise ValueError(
"max_requests must be >= 1 when set for replay profile, "
f"got {max_requests}"
)

# For replay profile, rate is interpreted as time_scale (not requests per
# second)
time_scale = rate[0] if rate and len(rate) > 0 else 1.0

# Load all timestamps first (max_requests applied after max_seconds filtering)
relative_timestamps = load_relative_timestamps(path)

# Filter by max_seconds (applied in simulated time via time_scale)
max_seconds = constraints.get("max_seconds")
if max_seconds is not None and max_seconds > 0:
relative_timestamps = [
ts for ts in relative_timestamps if ts * time_scale <= max_seconds
]

# Truncate by max_requests on top of any max_seconds filtering
if max_requests is not None:
relative_timestamps = relative_timestamps[:max_requests]

if not relative_timestamps:
raise ValueError(
"No timestamps remain after applying max_seconds and max_requests "
"filters. The trace is empty or all events were filtered out."
)

# Set max_requests to the actual count after filtering to prevent benchmark hang
# and eliminate race conditions between request completion and injection.
constraints["max_requests"] = len(relative_timestamps)

# Remove max_seconds to avoid runtime MaxDurationConstraint canceling
# in-flight requests
constraints.pop("max_seconds", None)

return {
"relative_timestamps": relative_timestamps,
"time_scale": time_scale,
"constraints": constraints,
"max_seconds_filter": max_seconds
if max_seconds and max_seconds > 0
else None,
}

@property
def strategy_types(self) -> list[str]:
return ["trace"]

def next_strategy(
self,
prev_strategy: SchedulingStrategy | None,
prev_benchmark: Benchmark | None,
) -> TraceReplayStrategy | None:
_ = prev_benchmark
# Replay has a single strategy; return it once, then None
if prev_strategy is not None:
return None
return TraceReplayStrategy(
relative_timestamps=self.relative_timestamps,
time_scale=self.time_scale,
)


@Profile.register("concurrent")
class ConcurrentProfile(Profile):
"""
2 changes: 2 additions & 0 deletions src/guidellm/data/deserializers/__init__.py
@@ -25,6 +25,7 @@
SyntheticTextDataset,
SyntheticTextDatasetDeserializer,
)
from .trace_synthetic import TraceSyntheticDatasetDeserializer

__all__ = [
"ArrowFileDatasetDeserializer",
@@ -46,4 +47,5 @@
"SyntheticTextDatasetDeserializer",
"TarFileDatasetDeserializer",
"TextFileDatasetDeserializer",
"TraceSyntheticDatasetDeserializer",
]