
Commit 0f0012a

feat: Add Chat API + Multi-turn support
- Implement UserSessionChatAPIData for the Chat API with multi-turn chat
- Update SharedPrefixDataGenerator to support the Chat API + multi-turn combination
- Fix prompt selection to use a different question for each turn
- Initialize the user session context as a list for Chat API compatibility
1 parent 1fe4026 commit 0f0012a
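
The gist of the change: with the Completion API a session's multi-turn context is a single prompt string that keeps growing, while the Chat API expects a list of role-tagged messages, which is why the session context is now initialized and extended as a list. A minimal sketch of the two context shapes (placeholder strings, not code from this commit):

    # Completion API: context is one growing prompt string.
    completion_context = "<shared prefix> <question 1> <answer 1> <question 2>"

    # Chat API: context is a list of role-tagged messages.
    chat_context = [
        {"role": "system", "content": "<shared prefix>"},
        {"role": "user", "content": "<question 1>"},
        {"role": "assistant", "content": "<answer 1>"},
        {"role": "user", "content": "<question 2>"},
    ]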

File tree

3 files changed: +141 -24 lines


inference_perf/apis/chat.py

Lines changed: 3 additions & 0 deletions
@@ -31,6 +31,7 @@ class ChatMessage(BaseModel):
 class ChatCompletionAPIData(InferenceAPIData):
     messages: List[ChatMessage]
     max_tokens: int = 0
+    model_response: str = ""  # Store the assistant response for multi-turn chat

     def get_api_type(self) -> APIType:
         return APIType.Chat
@@ -81,6 +82,7 @@ async def process_response(self, response: ClientResponse, config: APIConfig, to
         prompt_text = "".join([msg.content for msg in self.messages if msg.content])
         prompt_len = tokenizer.count_tokens(prompt_text)
         output_len = tokenizer.count_tokens(output_text)
+        self.model_response = output_text  # Store response for multi-turn chat
         return InferenceInfo(
             input_tokens=prompt_len,
             output_tokens=output_len,
@@ -93,6 +95,7 @@ async def process_response(self, response: ClientResponse, config: APIConfig, to
         if len(choices) == 0:
             return InferenceInfo(input_tokens=prompt_len)
         output_text = "".join([choice.get("message", {}).get("content", "") for choice in choices])
+        self.model_response = output_text  # Store response for multi-turn chat
         output_len = tokenizer.count_tokens(output_text)
         return InferenceInfo(
             input_tokens=prompt_len,

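For reference, the non-streaming branch above reads an OpenAI-style Chat Completions body; a minimal sketch of that shape and of the extraction the diff performs (field values are made up):

    # Illustrative OpenAI-style Chat Completions response body.
    response_body = {
        "choices": [
            {"message": {"role": "assistant", "content": "Paris is the capital of France."}}
        ]
    }

    choices = response_body.get("choices", [])
    # Same extraction as above: join every choice's message content.
    output_text = "".join([choice.get("message", {}).get("content", "") for choice in choices])
    assert output_text == "Paris is the capital of France."
    # model_response keeps this text so the next turn can append it to the
    # session history as ChatMessage(role="assistant", content=output_text).
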
inference_perf/apis/user_session.py

Lines changed: 83 additions & 7 deletions
@@ -7,22 +7,24 @@
 from inference_perf.apis import CompletionAPIData, InferenceInfo
 from inference_perf.utils.custom_tokenizer import CustomTokenizer
 from inference_perf.config import APIConfig
+from inference_perf.apis.chat import ChatCompletionAPIData, ChatMessage

 logger = logging.getLogger(__name__)


 class LocalUserSession:
     user_session_id: str
-    context: str
+    context: Any

-    def __init__(self, user_session_id: str, context: str = ""):
+    def __init__(self, user_session_id: str, context: Any = ""):
         self.user_session_id = user_session_id
         self.contexts = context if context else ""
         self._current_round = 0
         self._in_flight: asyncio.Lock = asyncio.Lock()
         self._waiting_rounds: asyncio.Queue[asyncio.Future[bool]] = asyncio.Queue()

-    async def get_context(self, round: int) -> str:
+    async def get_context(self, round: int) -> Any:
         if not self._waiting_rounds.empty() or self._in_flight.locked():
             # entering waiting queue
             future: asyncio.Future[bool] = asyncio.Future()
@@ -32,7 +34,7 @@ async def get_context(self, round: int) -> str:
         self._current_round += 1
         return self.contexts

-    def update_context(self, response: str) -> None:
+    def update_context(self, response: Any) -> None:
         self.contexts = response

         if not self._waiting_rounds.empty():
@@ -76,6 +78,80 @@ async def process_failure(
         return inference_info


-# TODO: UserSessionChatAPIData need to be implemented
-# class UserSessionChatAPIData(ChatCompletionAPIData):
-#     ...
+
+class UserSessionChatAPIData(ChatCompletionAPIData):
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+    user_session: LocalUserSession = Field(exclude=True)
+    target_round: int
+
+    async def to_payload(self, model_name: str, max_tokens: int, ignore_eos: bool, streaming: bool) -> dict[str, Any]:
+        self._session_context = await self.user_session.get_context(self.target_round)
+        # Merge this turn's messages into the session history:
+        # self.messages holds the new user message(s) (possibly with a system message);
+        # self._session_context holds the history (system prompt + previous turns).
+        if isinstance(self._session_context, list):
+            # History already exists: append only the new user message(s),
+            # dropping any system message (it is already in the history).
+            new_messages = [msg for msg in self.messages if msg.role != "system"]
+            full_messages = self._session_context + new_messages
+        else:
+            # First turn: context is not a list yet, use all messages (including system)
+            full_messages = self.messages
+
+        # ChatCompletionAPIData.to_payload reads self.messages, so rather than
+        # temporarily overriding self.messages, construct the payload directly here.
+        if self.max_tokens == 0:
+            self.max_tokens = max_tokens
+
+        return {
+            "model": model_name,
+            "messages": [{"role": m.role, "content": m.content} for m in full_messages],
+            "max_tokens": self.max_tokens,
+            "ignore_eos": ignore_eos,
+            "stream": streaming,
+        }
+
+    def update_inference_info(self, inference_info: InferenceInfo) -> None:
+        inference_info.extra_info["user_session"] = self.user_session.user_session_id
+        inference_info.extra_info["chat_round"] = self.user_session._current_round
+
+    async def process_response(self, response: ClientResponse, config: APIConfig, tokenizer: CustomTokenizer) -> InferenceInfo:
+        inference_info = await super().process_response(response, config, tokenizer)
+        self.update_inference_info(inference_info)
+
+        # Update the context with the new turn:
+        #   history <- history + user message(s) + assistant response
+        # self._session_context is the history before this turn, self.messages
+        # holds this turn's user message(s), and self.model_response is the
+        # assistant response text.
+        new_history = []
+        if isinstance(self._session_context, list):
+            # History already exists, extend it
+            new_history.extend(self._session_context)
+            # Add only the new user message(s), excluding system (already in history)
+            new_messages = [msg for msg in self.messages if msg.role != "system"]
+            new_history.extend(new_messages)
+        else:
+            # First turn: include all messages (system + user)
+            new_history.extend(self.messages)
+
+        # Add assistant response
+        new_history.append(ChatMessage(role="assistant", content=self.model_response))
+
+        self.user_session.update_context(new_history)
+        return inference_info
+
+    async def process_failure(
+        self, response: Optional[ClientResponse], config: APIConfig, tokenizer: CustomTokenizer, exception: Exception
+    ) -> Optional[InferenceInfo]:
+        # No response returned: keep the context from the last round (do not add new messages)
+        inference_info = InferenceInfo()
+        self.update_inference_info(inference_info)
+        self.user_session.update_context(self._session_context)
+        return inference_info

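Taken together, to_payload and process_response grow the history by one user/assistant exchange per round. A standalone trace of that behavior (a sketch of the commit's logic, using a stand-in Msg class and made-up strings):

    from dataclasses import dataclass

    @dataclass
    class Msg:  # stand-in for ChatMessage
        role: str
        content: str

    def run_turn(history: list[Msg], turn_msgs: list[Msg], model_response: str) -> list[Msg]:
        # Mirrors to_payload/process_response: drop system messages that are
        # already in the history, then append this turn and the assistant reply.
        new_msgs = [m for m in turn_msgs if m.role != "system"]
        return history + new_msgs + [Msg("assistant", model_response)]

    history = [Msg("system", "<shared prefix>")]  # initial context for the Chat API
    history = run_turn(history, [Msg("user", "<question 1>")], "<answer 1>")
    history = run_turn(history, [Msg("user", "<question 2>")], "<answer 2>")
    assert [m.role for m in history] == ["system", "user", "assistant", "user", "assistant"]
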
inference_perf/datagen/shared_prefix_datagen.py

Lines changed: 55 additions & 17 deletions
@@ -4,7 +4,7 @@

 from inference_perf.apis.base import InferenceAPIData, LazyLoadInferenceAPIData
 from inference_perf.apis.completion import CompletionAPIData
-from inference_perf.apis.user_session import LocalUserSession, UserSessionCompletionAPIData
+from inference_perf.apis.user_session import LocalUserSession, UserSessionCompletionAPIData, UserSessionChatAPIData
 from inference_perf.apis.chat import ChatCompletionAPIData, ChatMessage
 from inference_perf.config import APIConfig, APIType, DataConfig
 from inference_perf.utils.custom_tokenizer import CustomTokenizer
@@ -65,25 +65,54 @@ def is_prefered_worker_requested(self) -> bool:
         return True

     def load_lazy_data(self, data: LazyLoadInferenceAPIData) -> InferenceAPIData:
-        i = data.data_index % len(self.prompts)
         if self.enable_multi_turn_chat:
             user_id = data.data_index % len(self.user_sessions)
             round = data.data_index // len(self.user_sessions)
-            return UserSessionCompletionAPIData(
-                prompt=self.prompts[i],
-                max_tokens=self.output_len,
-                user_session=self.user_sessions[user_id],
-                target_round=round,
-            )
-        elif self.api_config.type == APIType.Chat:
-            shared_prefix, question = self.prompt_pairs[i]
-            messages = [
-                ChatMessage(role="system", content=shared_prefix),
-                ChatMessage(role="user", content=question)
-            ]
-            return ChatCompletionAPIData(messages=messages, max_tokens=self.output_len)
+
+            # Each user belongs to a group, and each group has num_prompts_per_group
+            # questions. Work out which group this user belongs to and which
+            # question in the group it starts from.
+            group_id = user_id // self.num_prompts_per_group
+            prompt_in_group = user_id % self.num_prompts_per_group
+
+            # For each round, use a different question from the same group,
+            # cycling through the group's questions.
+            question_in_group = (prompt_in_group + round) % self.num_prompts_per_group
+            question_idx = group_id * self.num_prompts_per_group + question_in_group
+
+            if self.api_config.type == APIType.Chat:
+                # Chat API + multi-turn: use UserSessionChatAPIData
+                shared_prefix, question = self.prompt_pairs[question_idx]
+                messages = [
+                    ChatMessage(role="system", content=shared_prefix),
+                    ChatMessage(role="user", content=question)
+                ]
+                return UserSessionChatAPIData(
+                    messages=messages,
+                    max_tokens=self.output_len,
+                    user_session=self.user_sessions[user_id],
+                    target_round=round,
+                )
+            else:
+                # Completion API + multi-turn: use UserSessionCompletionAPIData
+                return UserSessionCompletionAPIData(
+                    prompt=self.prompts[question_idx],
+                    max_tokens=self.output_len,
+                    user_session=self.user_sessions[user_id],
+                    target_round=round,
+                )
         else:
-            return CompletionAPIData(prompt=self.prompts[i], max_tokens=self.output_len)
+            # Single-turn: use data_index directly
+            i = data.data_index % len(self.prompts)
+            if self.api_config.type == APIType.Chat:
+                shared_prefix, question = self.prompt_pairs[i]
+                messages = [
+                    ChatMessage(role="system", content=shared_prefix),
+                    ChatMessage(role="user", content=question)
+                ]
+                return ChatCompletionAPIData(messages=messages, max_tokens=self.output_len)
+            else:
+                return CompletionAPIData(prompt=self.prompts[i], max_tokens=self.output_len)

     def get_request(self, n: int) -> InferenceAPIData:
         i = n % len(self.prompts)
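
To make the index arithmetic above concrete, a worked example with an illustrative num_prompts_per_group of 4:

    num_prompts_per_group = 4  # illustrative value

    def question_index(user_id: int, round: int) -> int:
        # Same arithmetic as load_lazy_data above.
        group_id = user_id // num_prompts_per_group
        prompt_in_group = user_id % num_prompts_per_group
        question_in_group = (prompt_in_group + round) % num_prompts_per_group
        return group_id * num_prompts_per_group + question_in_group

    # User 6 sits in group 1 (questions 4..7) and starts at question 6; each
    # round moves to the next question in the group, wrapping around.
    assert [question_index(6, r) for r in range(4)] == [6, 7, 4, 5]
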
@@ -151,10 +180,19 @@ def _generate_prompts(self) -> None:

         if self.enable_multi_turn_chat:
             # multi turn chat, create user to keep conversation
+            # For the Chat API, the context should be a list of messages starting
+            # with the system prompt; for the Completion API, it is a string.
+            if self.api_config.type == APIType.Chat:
+                initial_context = [
+                    ChatMessage(role="system", content=shared_prefix_text)
+                ]
+            else:
+                initial_context = shared_prefix_text
+
             self.user_sessions.append(
                 LocalUserSession(
                     user_session_id=f"user_session_{self.num_prompts_per_group * group_id + prompt_id}",
-                    context=shared_prefix_text,
+                    context=initial_context,
                 )
             )