
Commit d130dcb

fix: correct ISL token count and fix zmq message size (#597) (#602)
Co-authored-by: Pavithra Vijayakrishnan <160681768+pvijayakrish@users.noreply.github.com>
1 parent: 809a07d

File tree

19 files changed: +1016 −528 lines

src/aiperf/common/bootstrap.py

Lines changed: 11 additions & 0 deletions

@@ -79,6 +79,17 @@ async def _run_service():

     ensure_modules_loaded()

+    if service_class.__name__ in ("Worker", "TimingManager"):
+        # Disable garbage collection in child processes to prevent unpredictable latency spikes.
+        # Only required in timing critical services such as Worker and TimingManager.
+        import gc
+
+        for _ in range(3):  # Run 3 times to ensure all objects are collected
+            gc.collect()
+        gc.freeze()
+        gc.set_threshold(0)
+        gc.disable()
+
     # Load and apply custom GPU metrics in child process
     if user_config.gpu_telemetry_metrics_file:
         from aiperf.gpu_telemetry import constants
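
For reference, the same collect/freeze/disable sequence in isolation — a minimal standalone sketch with a hypothetical helper name, not code from this commit:

import gc

def quiesce_gc() -> None:
    """Make the collector silent for the lifetime of a latency-critical process."""
    for _ in range(3):     # repeated passes let multi-step finalizers fully drain
        gc.collect()
    gc.freeze()            # move surviving startup objects to the permanent
                           # generation, so future scans never touch them
    gc.set_threshold(0)    # never trigger automatic collection by allocation count
    gc.disable()           # and switch automatic collection off entirely

quiesce_gc()
print(f"GC enabled: {gc.isenabled()}, frozen objects: {gc.get_freeze_count()}")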

src/aiperf/common/models/dataset_models.py

Lines changed: 40 additions & 0 deletions

@@ -146,6 +146,46 @@ def metadata(self) -> TurnMetadata:
             delay_ms=self.delay,
         )

+    def copy_with_stripped_media(self) -> "Turn":
+        """Create a copy of this turn with multimodal data replaced by placeholders.
+
+        This preserves text data (needed for tokenization) but replaces potentially
+        large image/audio/video contents with small placeholder strings. This is
+        more efficient than a full deep copy followed by stripping.
+
+        Returns:
+            A new Turn with stripped multimodal contents.
+        """
+        return Turn(
+            model=self.model,
+            role=self.role,
+            timestamp=self.timestamp,
+            delay=self.delay,
+            max_tokens=self.max_tokens,
+            texts=[Text(name=t.name, contents=list(t.contents)) for t in self.texts],
+            images=[
+                Image(
+                    name=img.name,
+                    contents=[f"image_{i}" for i in range(len(img.contents))],
+                )
+                for img in self.images
+            ],
+            audios=[
+                Audio(
+                    name=aud.name,
+                    contents=[f"audio_{i}" for i in range(len(aud.contents))],
+                )
+                for aud in self.audios
+            ],
+            videos=[
+                Video(
+                    name=vid.name,
+                    contents=[f"video_{i}" for i in range(len(vid.contents))],
+                )
+                for vid in self.videos
+            ],
+        )
+

 class ConversationMetadata(AIPerfBaseModel):
     """Metadata of a conversation."""

src/aiperf/common/models/record_models.py

Lines changed: 5 additions & 4 deletions

@@ -447,10 +447,6 @@ class RequestRecord(AIPerfBaseModel):
         default=None,
         description="The original request info.",
     )
-    turns: list[Turn] = Field(
-        default_factory=list,
-        description="The actual turns of the request. This will include assistant turns as well as user turns in multi-turn conversations.",
-    )
     request_headers: dict[str, str] | None = Field(
         default=None,
         description="The headers of the request.",
@@ -510,6 +506,11 @@ class RequestRecord(AIPerfBaseModel):
         "Includes detailed timing for connection establishment, DNS resolution, request/response events, etc. "
         "The type of the trace data is determined by the transport and library used.",
     )
+    turns: list[Turn] = Field(
+        default_factory=list,
+        description="Deep copy of the request turns. This is a copy of the turns from request_info, "
+        "made to avoid mutating the original session data when stripping multimodal content.",
+    )

     @field_validator("trace_data", mode="before")
     @classmethod
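
The re-described field pairs with the inference_client.py change further down: assigning request_info.turns directly to the record aliased the live session objects, so stripping media on the record would have corrupted the source dataset. A generic demo of that hazard (plain Python, not aiperf code):

session_turns = [{"role": "user", "images": ["<large base64 payload>"]}]

record_turns = session_turns                       # alias, not a copy
record_turns[0]["images"] = ["image_0"]            # "strip" the record...
assert session_turns[0]["images"] == ["image_0"]   # ...and the session mutates too

# A per-record copy, as copy_with_stripped_media provides, keeps the session intact.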

src/aiperf/dataset/dataset_manager.py

Lines changed: 61 additions & 25 deletions

@@ -1,6 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 import asyncio
+import gc
 import time

 import orjson
@@ -22,6 +23,7 @@
 from aiperf.common.factories import (
     ComposerFactory,
     DatasetBackingStoreFactory,
+    DatasetClientStoreFactory,
     EndpointFactory,
     ServiceFactory,
 )
@@ -45,6 +47,7 @@
 )
 from aiperf.common.protocols import (
     DatasetBackingStoreProtocol,
+    DatasetClientStoreProtocol,
     EndpointProtocol,
     ServiceProtocol,
 )
@@ -94,6 +97,7 @@ def __init__(
                 benchmark_id=user_config.benchmark_id,
             )
         )
+        self._dataset_client: DatasetClientStoreProtocol | None = None

     @on_command(CommandType.PROFILE_CONFIGURE)
     async def _profile_configure_command(
@@ -111,9 +115,33 @@ async def _profile_configure_command(
         begin = time.perf_counter()
         await self._configure_dataset()
         await self._generate_inputs_json_file()
+        await self._configure_dataset_client_and_free_memory()
+
         duration = time.perf_counter() - begin
         self.info(lambda: f"Dataset configured in {duration:.2f} seconds")

+    async def _configure_dataset_client_and_free_memory(self) -> None:
+        """Configure the dataset client for serving fallback requests."""
+        # Create dataset client for serving fallback requests, then free in-memory dataset
+        client_metadata = self._backing_store.get_client_metadata()
+        self._dataset_client = DatasetClientStoreFactory.create_instance(
+            client_metadata=client_metadata,
+        )
+        await self._dataset_client.initialize()
+        # Now that the client is ready, signal that fallback requests can be served
+        self.dataset_configured.set()
+        # Free the in-memory dataset now that we have the client to serve fallback requests.
+        # Reassign to new empty containers (not .clear()) to release object references,
+        # then run gc.collect() twice to ensure circular references are cleaned up.
+        conversation_count = len(self.dataset)
+        self.dataset = {}
+        self._conversation_ids_cache = []
+        gc.collect()
+        gc.collect()
+        self.info(
+            f"Dataset client initialized and freed {conversation_count} conversations from memory"
+        )
+
     async def _configure_tokenizer(self) -> None:
         """Configure the tokenizer for the dataset manager."""
         tokenizer_name = self.user_config.tokenizer.name
@@ -304,7 +332,9 @@ async def _configure_dataset(self) -> None:
             f"unique conversations: {len(self.dataset_metadata.conversations)}, "
             f"unique turn count: {self.dataset_metadata.total_turn_count}"
         )
-        self.dataset_configured.set()
+        # Note: dataset_configured event is set in _profile_configure_command after
+        # the dataset client is initialized, to avoid a race condition where fallback
+        # requests arrive before the client is ready.
         await self.publish(
             DatasetConfiguredNotification(
                 service_id=self.service_id,
@@ -317,55 +347,58 @@ async def _configure_dataset(self) -> None:
     async def _handle_conversation_request(
         self, message: ConversationRequestMessage
     ) -> ConversationResponseMessage:
-        """Handle a conversation request."""
+        """Handle a conversation request using the dataset client."""
         self.debug(lambda: f"Handling conversation request: {message}")

         await self._wait_for_dataset_configuration()

-        if not self.dataset:
+        if self._dataset_client is None:
             raise self._service_error(
-                "Dataset is empty and must be configured before handling requests.",
+                "Dataset client is not initialized. Dataset must be configured before handling requests.",
             )

-        return self._return_conversation_by_id(
-            request_id=message.request_id,
-            conversation_id=message.conversation_id,
-        )
-
-    def _return_conversation_by_id(
-        self, request_id: str | None, conversation_id: str
-    ) -> ConversationResponseMessage:
-        """Return a conversation if it exists, otherwise raise an error."""
-
-        if conversation_id not in self.dataset:
-            raise self._service_error(
-                f"Conversation {conversation_id} not found in dataset.",
+        try:
+            conversation = await self._dataset_client.get_conversation(
+                message.conversation_id
             )
+        except KeyError:
+            raise self._service_error(
+                f"Conversation {message.conversation_id} not found in dataset.",
+            ) from None

-        conversation = self.dataset[conversation_id]
         self.trace_or_debug(
             lambda: f"Sending conversation response: {conversation}",
             lambda: f"Sending conversation response with id: {conversation.session_id}",
         )
         return ConversationResponseMessage(
             service_id=self.service_id,
-            request_id=request_id,
+            request_id=message.request_id,
             conversation=conversation,
         )

     @on_request(MessageType.CONVERSATION_TURN_REQUEST)
     async def _handle_conversation_turn_request(
         self, message: ConversationTurnRequestMessage
     ) -> ConversationTurnResponseMessage:
-        """Handle a turn request."""
+        """Handle a turn request using the dataset client."""
         self.debug(lambda: f"Handling turn request: {message}")

-        if message.conversation_id not in self.dataset:
+        await self._wait_for_dataset_configuration()
+
+        if self._dataset_client is None:
             raise self._service_error(
-                f"Conversation {message.conversation_id} not found in dataset.",
+                "Dataset client is not initialized. Dataset must be configured before handling requests.",
+            )
+
+        try:
+            conversation = await self._dataset_client.get_conversation(
+                message.conversation_id
             )
+        except KeyError as e:
+            raise self._service_error(
+                f"Conversation {message.conversation_id} not found in dataset.",
+            ) from e

-        conversation = self.dataset[message.conversation_id]
         if message.turn_index >= len(conversation.turns):
             raise self._service_error(
                 f"Turn index {message.turn_index} is out of range for conversation {message.conversation_id}.",
@@ -395,8 +428,11 @@ async def _wait_for_dataset_configuration(self) -> None:
         )

     @on_stop
-    async def _cleanup_backing_store(self) -> None:
-        """Clean up the backing store and associated mmap files."""
+    async def _cleanup(self) -> None:
+        """Clean up the backing store, dataset client, and associated mmap files."""
+        if self._dataset_client is not None:
+            await self._dataset_client.stop()
+            self.debug("Dataset client cleanup complete")
         if self._backing_store is not None:
             await self._backing_store.stop()
             self.debug("Backing store cleanup complete")

src/aiperf/records/inference_result_parser.py

Lines changed: 16 additions & 13 deletions

@@ -230,29 +230,31 @@ async def compute_input_token_count(
             return None

         tokenizer = await self.get_tokenizer(request_record.model_name)
-        input_token_count = 0
+        prompt_texts: list[str] = []

         # Include system_message if present (shared system prompt)
         if request_record.request_info and request_record.request_info.system_message:
-            input_token_count += len(
-                tokenizer.encode(request_record.request_info.system_message)
-            )
+            prompt_texts.append(request_record.request_info.system_message)

         # Include user_context_message if present (per-conversation user context)
         if (
             request_record.request_info
             and request_record.request_info.user_context_message
         ):
-            input_token_count += len(
-                tokenizer.encode(request_record.request_info.user_context_message)
-            )
+            prompt_texts.append(request_record.request_info.user_context_message)

         # Include all turns' text content
-        # TODO: We need to handle images, audios, videos, etc.
         for turn in turns:
             for text in turn.texts:
-                input_token_count += len(tokenizer.encode("".join(text.contents)))
-        return input_token_count
+                prompt_texts.append("".join(text.contents))
+
+        if not prompt_texts:
+            return None
+
+        # NOTE: We combine all the prompt texts with a space separator to create a single prompt string.
+        # This will get us the most accurate token count for the prompt by avoiding any potential
+        # boundary issues that could occur if we were to tokenize each text individually.
+        return self._compute_token_count(tokenizer, prompt_texts, separator=" ")

     async def _compute_server_token_counts(
         self, responses: list[ParsedResponse]
@@ -317,20 +319,21 @@ def _parse_output_and_reasoning_texts(
         return output_texts, reasoning_texts

     def _compute_token_count(
-        self, tokenizer: Tokenizer, texts: list[str]
+        self, tokenizer: Tokenizer, texts: list[str], separator: str = ""
     ) -> int | None:
-        """Compute the number of tokens in the texts by joining them without any separators and encoding with the tokenizer.
+        """Compute the number of tokens in the texts by joining them with an optional separator (default none) and encoding with the tokenizer.

        Args:
            tokenizer: The tokenizer to use
            texts: List of texts to compute the token count for
+           separator: The separator to use between the texts

        Returns:
            The number of tokens in the texts, or None if the texts are empty
        """
        if not texts:
            return None
-        return len(tokenizer.encode("".join(texts)))
+        return len(tokenizer.encode(separator.join(texts)))

     async def _compute_client_side_token_counts(
         self, request_record: RequestRecord, responses: list[ParsedResponse]
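
The separator NOTE above is the substance of the ISL token-count fix: token counts are not additive across string boundaries, so summing per-fragment counts (the old behavior) can disagree with the count of the real, joined prompt. A standalone illustration (assumes the transformers package; "gpt2" is only an example tokenizer, not necessarily what aiperf loads):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")
parts = ["You are a helpful assistant.", "What is the capital of France?"]

per_fragment = sum(len(tok.encode(p)) for p in parts)  # old approach: count each piece
joined = len(tok.encode(" ".join(parts)))              # new approach: count the real prompt

# The two can differ: BPE merges across the join point, and the leading space
# changes how the first word of the second fragment is tokenized.
print(per_fragment, joined)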

src/aiperf/timing/manager.py

Lines changed: 0 additions & 18 deletions

@@ -3,7 +3,6 @@


 import asyncio
-import gc

 from aiperf.common.base_component_service import BaseComponentService
 from aiperf.common.config import ServiceConfig, UserConfig
@@ -129,13 +128,6 @@ async def _on_start_profiling(self, _message: CommandMessage) -> None:
         if not self._phase_orchestrator:
             raise InvalidStateError("No phase orchestrator configured")

-        # Disable GC during profiling to eliminate unpredictable latency spikes.
-        # Collect and freeze first to minimize memory pressure during the benchmark.
-        self.debug("Disabling garbage collection for stable timing")
-        gc.collect()
-        gc.freeze()
-        gc.disable()
-
         # Start event loop health monitoring only during the benchmark
         self.event_loop_monitor.start()

@@ -164,16 +156,6 @@ async def _timing_manager_stop(self) -> None:
         await self._phase_orchestrator.stop()

         self.event_loop_monitor.stop()
-        self._re_enable_gc()
-
-    def _re_enable_gc(self) -> None:
-        """Re-enable garbage collection."""
-        self.debug(
-            "Re-enabling garbage collection to allow the timing manager "
-            "to clean up resources"
-        )
-        gc.unfreeze()
-        gc.enable()


 def main() -> None:

src/aiperf/workers/inference_client.py

Lines changed: 6 additions & 2 deletions

@@ -107,15 +107,13 @@ async def _send_request_internal(
             self.debug(
                 f"pre_send_perf_ns to start_perf_ns latency: {result.start_perf_ns - pre_send_perf_ns} ns"
             )
-            result.turns = request_info.turns
             return result
         except Exception as e:
             self.error(
                 f"Error calling inference server API at {self.model_endpoint.endpoint.base_url}: {e!r}"
             )
             return RequestRecord(
                 request_info=request_info,
-                turns=request_info.turns,
                 timestamp_ns=pre_send_timestamp_ns or time.time_ns(),
                 # Try and use the pre_send_perf_ns if it is available, otherwise use the current time.
                 start_perf_ns=pre_send_perf_ns or time.perf_counter_ns(),
@@ -156,11 +154,17 @@ def _enrich_request_record(
             or self.model_endpoint.primary_model_name
         )
         record.request_info = request_info
+
+        # Copy turns with stripped multimodal data to avoid mutating original session
+        # and reduce memory usage (placeholders instead of large image/audio/video data)
+        record.turns = [turn.copy_with_stripped_media() for turn in request_info.turns]
+
        # If this is the first turn, calculate the credit drop latency
        if request_info.turn_index == 0 and request_info.drop_perf_ns is not None:
            record.credit_drop_latency = (
                record.start_perf_ns - request_info.drop_perf_ns
            )
+
        # Preserve headers set by transport; only use endpoint headers if not set
        if record.request_headers is None:
            record.request_headers = request_info.endpoint_headers
