Skip to content

Commit a66034b

Browse files
committed
Add replay from trace strategy
Add trace replay capability to GuideLLM for reproducing real-world request patterns from trace files. This enables time-based request rate replay and synthetic prompt generation matching trace token counts.

- Add TraceReplayStrategy for scheduling requests at precise timestamps
- Add ReplayProfile for configuring trace-based benchmarking
- Add TraceSyntheticDatasetDeserializer for generating prompts from traces
- Support max_requests truncation to limit trace length

This is a minimal implementation to address issue 597. Full Mooncake format support, E2E tests, and documentation will follow in subsequent PRs.

Signed-off-by: Vincent Gimenes <vincent.gimenes@gmail.com>
1 parent e52e92c commit a66034b

File tree

10 files changed

+1295
-35
lines changed

10 files changed

+1295
-35
lines changed

src/guidellm/benchmark/entrypoints.py

Lines changed: 68 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -247,6 +247,7 @@ async def resolve_request_loader(
247247
data_num_workers: int | None,
248248
random_seed: int,
249249
console: Console | None = None,
250+
max_requests: int | None = None,
250251
**dataloader_kwargs: dict[str, Any] | None,
251252
) -> DataLoader[GenerationRequest]:
252253
"""
@@ -270,6 +271,7 @@ async def resolve_request_loader(
270271
:param data_num_workers: Number of worker processes for data loading
271272
:param random_seed: Seed for reproducible random operations
272273
:param console: Console instance for progress reporting, or None
274+
:param max_requests: If set, first data source loads at most this many rows.
273275
:param dataloader_kwargs: Additional arguments passed to DataLoader initialization
274276
:return: Configured DataLoader instance for GenerationRequest objects
275277
:raises ValueError: If request formatter type is not registered in
@@ -306,6 +308,17 @@ async def resolve_request_loader(
306308
data_finalizer,
307309
)
308310

311+
# When max_requests is set, limit the first data source to that many rows at load
312+
if max_requests is not None and data:
313+
if max_requests < 1:
314+
raise ValueError(
315+
"max_requests must be >= 1 when set for data truncation, "
316+
f"got {max_requests}"
317+
)
318+
data_args = list(data_args) if data_args else [{} for _ in data]
319+
if len(data_args) >= 1:
320+
data_args[0] = {**data_args[0], "max_rows": max_requests}
321+
309322
request_loader: DataLoader[GenerationRequest] = DataLoader(
310323
data=data,
311324
data_args=data_args,
@@ -352,6 +365,7 @@ async def resolve_profile(
352365
max_global_error_rate: float | None,
353366
over_saturation: dict[str, Any] | None = None,
354367
console: Console | None = None,
368+
data: list[Any] | None = None,
355369
) -> Profile:
356370
"""
357371
Resolve and configure a benchmark profile with rate and constraint settings.
@@ -373,6 +387,7 @@ async def resolve_profile(
373387
:param max_global_error_rate: Maximum global error rate threshold before stopping
374388
:param over_saturation: Over-saturation detection configuration (dict)
375389
:param console: Console instance for progress reporting, or None
390+
:param data: Optional list of data sources.
376391
:return: Configured Profile instance ready for benchmarking
377392
:raises ValueError: If constraints are provided with a pre-configured Profile
378393
"""
@@ -400,6 +415,7 @@ async def resolve_profile(
400415
random_seed=random_seed,
401416
rampup_duration=rampup,
402417
constraints={**constraints},
418+
data=data,
403419
)
404420
elif constraints:
405421
raise ValueError(
@@ -488,24 +504,58 @@ async def benchmark_generative_text(
488504
processor = await resolve_processor(
489505
processor=args.processor, model=model, console=console
490506
)
491-
request_loader = await resolve_request_loader(
492-
data=args.data,
493-
model=model,
494-
data_args=args.data_args,
495-
data_samples=args.data_samples,
496-
processor=processor,
497-
processor_args=args.processor_args,
498-
data_column_mapper=args.data_column_mapper,
499-
data_preprocessors=args.data_preprocessors,
500-
data_preprocessors_kwargs=args.data_preprocessors_kwargs,
501-
data_finalizer=args.data_finalizer,
502-
data_collator=args.data_collator,
503-
data_sampler=args.data_sampler,
504-
data_num_workers=args.data_num_workers,
505-
random_seed=args.random_seed,
506-
console=console,
507-
**(args.dataloader_kwargs or {}),
508-
)
507+
508+
# Build common kwargs for resolve_profile and resolve_request_loader
509+
profile_kwargs = {
510+
"profile": args.profile,
511+
"rate": args.rate,
512+
"random_seed": args.random_seed,
513+
"rampup": args.rampup,
514+
"constraints": constraints,
515+
"max_seconds": args.max_seconds,
516+
"max_requests": args.max_requests,
517+
"max_errors": args.max_errors,
518+
"max_error_rate": args.max_error_rate,
519+
"max_global_error_rate": args.max_global_error_rate,
520+
"over_saturation": args.over_saturation,
521+
"console": console,
522+
}
523+
loader_kwargs = {
524+
"data": args.data,
525+
"model": model,
526+
"data_args": args.data_args,
527+
"data_samples": args.data_samples,
528+
"processor": processor,
529+
"processor_args": args.processor_args,
530+
"data_column_mapper": args.data_column_mapper,
531+
"data_preprocessors": args.data_preprocessors,
532+
"data_preprocessors_kwargs": args.data_preprocessors_kwargs,
533+
"data_finalizer": args.data_finalizer,
534+
"data_collator": args.data_collator,
535+
"data_sampler": args.data_sampler,
536+
"data_num_workers": args.data_num_workers,
537+
"random_seed": args.random_seed,
538+
"console": console,
539+
}
540+
541+
# For replay profile: resolve profile first to apply max_seconds filtering,
542+
# then use the filtered count for the data loader. This ensures the data
543+
# loader and scheduler both work with the same filtered request count.
544+
if args.profile == "replay":
545+
profile = await resolve_profile(**profile_kwargs, data=args.data) # type: ignore[arg-type]
546+
effective_max_requests = (
547+
profile.constraints.get("max_requests")
548+
if profile.constraints
549+
else args.max_requests
550+
)
551+
request_loader = await resolve_request_loader(
552+
**loader_kwargs, max_requests=effective_max_requests
553+
) # type: ignore[arg-type]
554+
else:
555+
request_loader = await resolve_request_loader(
556+
**loader_kwargs, max_requests=args.max_requests
557+
) # type: ignore[arg-type]
558+
profile = await resolve_profile(**profile_kwargs, data=None) # type: ignore[arg-type]
509559

510560
warmup = TransientPhaseConfig.create_from_value(args.warmup)
511561
cooldown = TransientPhaseConfig.create_from_value(args.cooldown)
@@ -521,21 +571,6 @@ async def benchmark_generative_text(
521571
),
522572
status="success",
523573
)
524-
525-
profile = await resolve_profile(
526-
profile=args.profile,
527-
rate=args.rate,
528-
random_seed=args.random_seed,
529-
rampup=args.rampup,
530-
constraints=constraints,
531-
max_seconds=args.max_seconds,
532-
max_requests=args.max_requests,
533-
max_errors=args.max_errors,
534-
max_error_rate=args.max_error_rate,
535-
max_global_error_rate=args.max_global_error_rate,
536-
over_saturation=args.over_saturation,
537-
console=console,
538-
)
539574
output_formats = await resolve_output_formats(
540575
outputs=args.outputs, output_dir=args.output_dir, console=console
541576
)

src/guidellm/benchmark/profiles.py

Lines changed: 120 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313

1414
from abc import ABC, abstractmethod
1515
from collections.abc import Generator
16+
from pathlib import Path
1617
from typing import TYPE_CHECKING, Annotated, Any, ClassVar, Literal
1718

1819
import numpy as np
@@ -37,6 +38,8 @@
3738
SchedulingStrategy,
3839
SynchronousStrategy,
3940
ThroughputStrategy,
41+
TraceReplayStrategy,
42+
load_relative_timestamps,
4043
)
4144
from guidellm.schemas import PydanticClassRegistryMixin
4245

@@ -48,13 +51,14 @@
4851
"ConcurrentProfile",
4952
"Profile",
5053
"ProfileType",
54+
"ReplayProfile",
5155
"SweepProfile",
5256
"SynchronousProfile",
5357
"ThroughputProfile",
5458
]
5559

5660
ProfileType = Annotated[
57-
Literal["synchronous", "concurrent", "throughput", "async", "sweep"],
61+
Literal["synchronous", "concurrent", "throughput", "async", "sweep", "replay"],
5862
"Profile type identifiers for polymorphic deserialization",
5963
]
6064

@@ -328,6 +332,121 @@ def next_strategy(
328332
return SynchronousStrategy()
329333

330334

335+
@Profile.register("replay")
class ReplayProfile(Profile):
    """
    Replay a trace file:
    schedule each request at start_time + time_scale * relative_timestamp[i].

    For this profile, the ``rate`` argument is interpreted as time_scale (scale factor
    applied to relative timestamps), not as requests per second.

    When ``constraints["max_requests"]`` is set, the trace is truncated at load time:
    only the first max_requests rows are loaded from the file for both timestamps (here)
    and request data (in the data loader). This keeps timestamps and requests aligned.
    The trace file is read twice: once by the data pipeline for request payloads, and
    once here for relative timestamps.
    """

    # Polymorphic discriminator for Profile deserialization.
    type_: Literal["replay"] = "replay"  # type: ignore[assignment]
    relative_timestamps: list[float] = Field(
        description="Request start times relative to first event (first = 0)",
    )
    time_scale: float = Field(
        default=1.0,
        gt=0,
        description="Scale factor applied to relative timestamps",
    )
    max_seconds_filter: float | None = Field(
        default=None,
        description=(
            "Original max_seconds value used as a load-time filter "
            "(not a runtime constraint)"
        ),
    )

    @classmethod
    def resolve_args(
        cls,
        rate_type: str,
        rate: list[float] | None,
        random_seed: int,
        **kwargs: Any,
    ) -> dict[str, Any]:
        """
        Resolve constructor arguments for a replay profile from CLI-level inputs.

        :param rate_type: Unused for replay; accepted for interface parity
        :param rate: If set, rate[0] is used as time_scale (not requests/second)
        :param random_seed: Unused for replay; accepted for interface parity
        :param kwargs: Must contain ``data`` (trace file path as first element);
            may contain ``constraints`` with ``max_seconds``/``max_requests``
        :return: Keyword arguments suitable for constructing a ReplayProfile
        :raises ValueError: If the trace file is missing, max_requests is invalid,
            or no timestamps remain after filtering
        """
        _ = (rate_type, random_seed)  # unused
        data = kwargs.get("data")
        if not data or not data[0]:
            raise ValueError("Replay profile requires data (path to trace file)")
        path = Path(data[0]) if isinstance(data[0], str) else data[0]
        if not path.exists():
            raise ValueError(f"Replay trace file not found: {path}")
        # Copy so we never mutate the caller-supplied constraints dict in place;
        # the max_requests rewrite and max_seconds pop below are local to the
        # profile's own constraint set.
        constraints = dict(kwargs.get("constraints") or {})
        max_requests = constraints.get("max_requests")
        if max_requests is not None and max_requests < 1:
            raise ValueError(
                "max_requests must be >= 1 when set for replay profile, "
                f"got {max_requests}"
            )

        # For replay profile, rate is interpreted as time_scale (not requests per
        # second)
        time_scale = rate[0] if rate and len(rate) > 0 else 1.0

        # Load all timestamps first (max_requests applied after max_seconds filtering)
        relative_timestamps = load_relative_timestamps(path)

        # Filter by max_seconds (applied in simulated time via time_scale)
        max_seconds = constraints.get("max_seconds")
        if max_seconds is not None and max_seconds > 0:
            relative_timestamps = [
                ts for ts in relative_timestamps if ts * time_scale <= max_seconds
            ]

        # Truncate by max_requests on top of any max_seconds filtering
        if max_requests is not None:
            relative_timestamps = relative_timestamps[:max_requests]

        if not relative_timestamps:
            raise ValueError(
                "No timestamps remain after applying max_seconds and max_requests "
                "filters. The trace is empty or all events were filtered out."
            )

        # Set max_requests to the actual count after filtering to prevent benchmark hang
        # and eliminate race conditions between request completion and injection.
        constraints["max_requests"] = len(relative_timestamps)

        # Remove max_seconds to avoid runtime MaxDurationConstraint canceling
        # in-flight requests
        constraints.pop("max_seconds", None)

        return {
            "relative_timestamps": relative_timestamps,
            "time_scale": time_scale,
            "constraints": constraints,
            # Preserve the original max_seconds value for reporting, since it was
            # consumed as a load-time filter rather than a runtime constraint.
            "max_seconds_filter": max_seconds
            if max_seconds and max_seconds > 0
            else None,
        }

    @property
    def strategy_types(self) -> list[str]:
        """Single strategy type: trace replay."""
        return ["trace"]

    def next_strategy(
        self,
        prev_strategy: SchedulingStrategy | None,
        prev_benchmark: Benchmark | None,
    ) -> TraceReplayStrategy | None:
        """
        Return the single trace replay strategy on the first call, None after.

        :param prev_strategy: Previously executed strategy, or None on first call
        :param prev_benchmark: Unused; accepted for interface parity
        :return: TraceReplayStrategy on first invocation, None thereafter
        """
        _ = prev_benchmark
        # Replay has a single strategy; return it once, then None
        if prev_strategy is not None:
            return None
        return TraceReplayStrategy(
            relative_timestamps=self.relative_timestamps,
            time_scale=self.time_scale,
        )
448+
449+
331450
@Profile.register("concurrent")
332451
class ConcurrentProfile(Profile):
333452
"""

src/guidellm/data/deserializers/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
SyntheticTextDataset,
2626
SyntheticTextDatasetDeserializer,
2727
)
28+
from .trace_synthetic import TraceSyntheticDatasetDeserializer
2829

2930
__all__ = [
3031
"ArrowFileDatasetDeserializer",
@@ -46,4 +47,5 @@
4647
"SyntheticTextDatasetDeserializer",
4748
"TarFileDatasetDeserializer",
4849
"TextFileDatasetDeserializer",
50+
"TraceSyntheticDatasetDeserializer",
4951
]

0 commit comments

Comments (0)