fix: improve streaming proxy throughput by fixing middleware and logging bottlenecks (#21501)

ishaan-jaff · web-flow · commit 6486db364624 · 2026-02-18T16:16:49.000-08:00
* fix(middleware): replace BaseHTTPMiddleware with pure ASGI middleware

BaseHTTPMiddleware wraps streaming responses with receive_or_disconnect
per chunk, blocking the event loop and causing severe throughput
degradation under concurrent streaming load (53% of CPU in profiling).

Converts PrometheusAuthMiddleware to a pure ASGI middleware using the
__call__(scope, receive, send) protocol.

* fix(streaming): remove expensive debug logging and optimize usage stripping

- Remove print_verbose calls that format chunk/response Pydantic objects,
  triggering millions of __repr__ calls (8% of CPU in profiling)
- Guard remaining verbose_logger.debug with isEnabledFor(DEBUG) and use
  lazy %s formatting instead of f-strings
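- Replace usage stripping round-trip (model_dump + delete + reconstruct)
  with a _usage_stripped flag, deferring exclusion to serialization time
  (superseded by a later commit in this squash: the final diff instead sets
  usage = None on the outgoing chunk and appends a model_copy() that still
  carries usage to self.chunks)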

* fix(proxy): remove per-chunk debug log and use _usage_stripped flag

- Remove verbose_proxy_logger.debug that formatted every streaming chunk
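- Honor _usage_stripped flag from streaming handler to exclude usage
  during model_dump_json serialization instead of reconstructing objects
  (superseded: the final proxy_server.py diff contains no _usage_stripped
  handling; only the per-chunk debug log removal remains)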

* fix(proxy): remove per-chunk debug log in async_data_generator

Remove verbose_proxy_logger.debug that formatted every streaming chunk,
which triggered expensive Pydantic serialization on the hot path.

* fix indentation and add clarifying comment for usage stripping

* fix: guard calculate_total_usage against None usage in chunks

* fix: store chunk copy to preserve usage for calculate_total_usage
diff --git a/litellm/litellm_core_utils/streaming_handler.py b/litellm/litellm_core_utils/streaming_handler.py
@@ -2,6 +2,7 @@
 import collections.abc
 import datetime
 import json
+import logging
 import threading
 import time
 import traceback
@@ -435,7 +436,7 @@ def handle_replicate_chunk(self, chunk):
 
     def handle_openai_chat_completion_chunk(self, chunk):
         try:
-            print_verbose(f"\nRaw OpenAI Chunk\n{chunk}\n")
+
             str_line = chunk
             text = ""
             is_finished = False
@@ -485,7 +486,7 @@ def handle_openai_chat_completion_chunk(self, chunk):
 
     def handle_azure_text_completion_chunk(self, chunk):
         try:
-            print_verbose(f"\nRaw OpenAI Chunk\n{chunk}\n")
+
             text = ""
             is_finished = False
             finish_reason = None
@@ -506,7 +507,7 @@ def handle_azure_text_completion_chunk(self, chunk):
 
     def handle_openai_text_completion_chunk(self, chunk):
         try:
-            print_verbose(f"\nRaw OpenAI Chunk\n{chunk}\n")
+
             text = ""
             is_finished = False
             finish_reason = None
@@ -870,9 +871,6 @@ def return_processed_chunk_logic(  # noqa
             preserve_upstream_non_openai_attributes,
         )
 
-        print_verbose(
-            f"completion_obj: {completion_obj}, model_response.choices[0]: {model_response.choices[0]}, response_obj: {response_obj}"
-        )
         is_chunk_non_empty = self.is_chunk_non_empty(
             completion_obj, model_response, response_obj
         )
@@ -899,11 +897,9 @@ def return_processed_chunk_logic(  # noqa
                                     choice_json.pop(
                                         "finish_reason", None
                                     )  # for mistral etc. which return a value in their last chunk (not-openai compatible).
-                                    print_verbose(f"choice_json: {choice_json}")
                                     choices.append(StreamingChoices(**choice_json))
                             except Exception:
                                 choices.append(StreamingChoices())
-                        print_verbose(f"choices in streaming: {choices}")
                         setattr(model_response, "choices", choices)
                     else:
                         return
@@ -921,9 +917,11 @@ def return_processed_chunk_logic(  # noqa
                     )
 
                     model_response = self.strip_role_from_delta(model_response)
-                    verbose_logger.debug(
-                        f"model_response.choices[0].delta inside is_chunk_non_empty: {model_response.choices[0].delta}"
-                    )
+                    if verbose_logger.isEnabledFor(logging.DEBUG):
+                        verbose_logger.debug(
+                            "model_response.choices[0].delta: %s",
+                            model_response.choices[0].delta,
+                        )
                 else:
                     ## else
                     completion_obj["content"] = model_response_str
@@ -1370,9 +1368,6 @@ def chunk_creator(self, chunk: Any):  # type: ignore  # noqa: PLR0915
                         )
 
             model_response.model = self.model
-            print_verbose(
-                f"model_response finish reason 3: {self.received_finish_reason}; response_obj={response_obj}"
-            )
             ## FUNCTION CALL PARSING
             original_chunk = (
                 response_obj.get("original_chunk") if response_obj is not None else None
@@ -1432,7 +1427,6 @@ def chunk_creator(self, chunk: Any):  # type: ignore  # noqa: PLR0915
                                             ):
                                                 t.function.arguments = ""
                             _json_delta = delta.model_dump()
-                            print_verbose(f"_json_delta: {_json_delta}")
                             if "role" not in _json_delta or _json_delta["role"] is None:
                                 _json_delta[
                                     "role"
@@ -1466,11 +1460,7 @@ def chunk_creator(self, chunk: Any):  # type: ignore  # noqa: PLR0915
                                 if original_chunk.choices[0].delta is None
                                 else dict(original_chunk.choices[0].delta)
                             )
-                            print_verbose(f"original delta: {delta}")
                             model_response.choices[0].delta = Delta(**delta)
-                            print_verbose(
-                                f"new delta: {model_response.choices[0].delta}"
-                            )
                         except Exception:
                             model_response.choices[0].delta = Delta()
                 else:
@@ -1480,11 +1470,6 @@ def chunk_creator(self, chunk: Any):  # type: ignore  # noqa: PLR0915
                     ):
                         return model_response
                     return
-            print_verbose(
-                f"model_response.choices[0].delta: {model_response.choices[0].delta}; completion_obj: {completion_obj}"
-            )
-            print_verbose(f"self.sent_first_chunk: {self.sent_first_chunk}")
-
             ## CHECK FOR TOOL USE
 
             if "tool_calls" in completion_obj and len(completion_obj["tool_calls"]) > 0:
@@ -1915,18 +1900,9 @@ async def __anext__(self):  # noqa: PLR0915
                         and len(chunk.parts) == 0
                     ):
                         continue
-                    # chunk_creator() does logging/stream chunk building. We need to let it know its being called in_async_func, so we don't double add chunks.
-                    # __anext__ also calls async_success_handler, which does logging
-                    verbose_logger.debug(
-                        f"PROCESSED ASYNC CHUNK PRE CHUNK CREATOR: {chunk}"
-                    )
-
                     processed_chunk: Optional[ModelResponseStream] = self.chunk_creator(
                         chunk=chunk
                     )
-                    verbose_logger.debug(
-                        f"PROCESSED ASYNC CHUNK POST CHUNK CREATOR: {processed_chunk}"
-                    )
                     if processed_chunk is None:
                         continue
 
@@ -1943,31 +1919,28 @@ async def __anext__(self):  # noqa: PLR0915
                     self.rules.post_call_rules(
                         input=self.response_uptil_now, model=self.model
                     )
-                    self.chunks.append(processed_chunk)
-                    
+                    # Store a shallow copy so usage stripping below
+                    # does not mutate the stored chunk.
+                    self.chunks.append(processed_chunk.model_copy())
+
                     # Add mcp_list_tools to first chunk if present
                     if not self.sent_first_chunk:
                         processed_chunk = self._add_mcp_list_tools_to_first_chunk(processed_chunk)
                         self.sent_first_chunk = True
-                    if hasattr(
-                        processed_chunk, "usage"
-                    ):  # remove usage from chunk, only send on final chunk
-                        # Convert the object to a dictionary
-                        obj_dict = processed_chunk.model_dump()
-
-                        # Remove an attribute (e.g., 'attr2')
-                        if "usage" in obj_dict:
-                            del obj_dict["usage"]
-
-                        # Create a new object without the removed attribute
-                        processed_chunk = self.model_response_creator(chunk=obj_dict)
+                    if (
+                        hasattr(processed_chunk, "usage")
+                        and getattr(processed_chunk, "usage", None) is not None
+                    ):
+                        # Strip usage from the outgoing chunk so
+                        # model_dump_json(exclude_none=True) drops it.
+                        # The copy in self.chunks retains usage for
+                        # calculate_total_usage().
+                        processed_chunk.usage = None  # type: ignore
                         is_empty = is_model_response_stream_empty(
                             model_response=cast(ModelResponseStream, processed_chunk)
                         )
-
                         if is_empty:
                             continue
-                    print_verbose(f"final returned processed chunk: {processed_chunk}")
 
                     # add usage as hidden param
                     if self.sent_last_chunk is True and self.stream_options is None:
@@ -1982,7 +1955,7 @@ async def __anext__(self):  # noqa: PLR0915
                             )
                         )
                         # Add MCP metadata to final chunk if present (after hooks)
-                        processed_chunk = self._add_mcp_metadata_to_final_chunk(processed_chunk)
+                        processed_chunk = self._add_mcp_metadata_to_final_chunk(processed_chunk)  # type: ignore[reportArgumentType]
 
                     return processed_chunk
                 raise StopAsyncIteration
@@ -1996,13 +1969,9 @@ async def __anext__(self):  # noqa: PLR0915
                     else:
                         chunk = next(self.completion_stream)
                     if chunk is not None and chunk != b"":
-                        print_verbose(f"PROCESSED CHUNK PRE CHUNK CREATOR: {chunk}")
                         processed_chunk: Optional[
                             ModelResponseStream
                         ] = self.chunk_creator(chunk=chunk)
-                        print_verbose(
-                            f"PROCESSED CHUNK POST CHUNK CREATOR: {processed_chunk}"
-                        )
                         if processed_chunk is None:
                             continue
 
@@ -2193,7 +2162,7 @@ def calculate_total_usage(chunks: List[ModelResponse]) -> Usage:
     prompt_tokens: int = 0
     completion_tokens: int = 0
     for chunk in chunks:
-        if "usage" in chunk:
+        if "usage" in chunk and chunk["usage"] is not None:
             if "prompt_tokens" in chunk["usage"]:
                 prompt_tokens = chunk["usage"].get("prompt_tokens", 0) or 0
             if "completion_tokens" in chunk["usage"]:
diff --git a/litellm/proxy/middleware/prometheus_auth_middleware.py b/litellm/proxy/middleware/prometheus_auth_middleware.py
@@ -1,16 +1,21 @@
 """
 Prometheus Auth Middleware
+
+Pure ASGI middleware — avoids Starlette's BaseHTTPMiddleware which wraps
+streaming responses with receive_or_disconnect per chunk, blocking the
+event loop and causing severe throughput degradation under concurrent
+streaming load.
 """
-from fastapi import Request
-from fastapi.responses import JSONResponse
-from starlette.middleware.base import BaseHTTPMiddleware
+from starlette.requests import Request
+from starlette.responses import JSONResponse
+from starlette.types import ASGIApp, Receive, Scope, Send
 
 import litellm
 from litellm.proxy._types import SpecialHeaders
 from litellm.proxy.auth.user_api_key_auth import user_api_key_auth
 
 
-class PrometheusAuthMiddleware(BaseHTTPMiddleware):
+class PrometheusAuthMiddleware:
     """
     Middleware to authenticate requests to the metrics endpoint
 
@@ -24,8 +29,15 @@ class PrometheusAuthMiddleware(BaseHTTPMiddleware):
     ```
     """
 
-    async def dispatch(self, request: Request, call_next):
-        # Check if this is a request to the metrics endpoint
+    def __init__(self, app: ASGIApp) -> None:
+        self.app = app
+
+    async def __call__(self, scope: Scope, receive: Receive, send: Send) -> None:
+        if scope["type"] != "http":  # starlette Request() asserts an "http" scope; pass websocket/lifespan through untouched
+            await self.app(scope, receive, send)
+            return
+
+        request = Request(scope, receive)
 
         if self._is_prometheus_metrics_endpoint(request):
             if self._should_run_auth_on_metrics_endpoint() is True:
@@ -38,15 +50,14 @@ async def dispatch(self, request: Request, call_next):
                         or "",
                     )
                 except Exception as e:
-                    return JSONResponse(
+                    response = JSONResponse(
                         status_code=401,
                         content=f"Unauthorized access to metrics endpoint: {getattr(e, 'message', str(e))}",
                     )
+                    await response(scope, receive, send)
+                    return
 
-        # Process the request and get the response
-        response = await call_next(request)
-
-        return response
+        await self.app(scope, receive, send)
 
     @staticmethod
     def _is_prometheus_metrics_endpoint(request: Request):
diff --git a/litellm/proxy/proxy_server.py b/litellm/proxy/proxy_server.py
@@ -5074,10 +5074,6 @@ async def async_data_generator(
             response=response,
             request_data=request_data,
         ):
-            verbose_proxy_logger.debug(
-                "async_data_generator: received streaming chunk - {}".format(chunk)
-            )
-
             ### CALL HOOKS ### - modify outgoing data
             chunk = await proxy_logging_obj.async_post_call_streaming_hook(
                 user_api_key_dict=user_api_key_dict,
diff --git a/tests/test_litellm/proxy/middleware/test_prometheus_auth_middleware_asgi.py b/tests/test_litellm/proxy/middleware/test_prometheus_auth_middleware_asgi.py
@@ -0,0 +1,24 @@
+"""
+Tests that PrometheusAuthMiddleware is a pure ASGI middleware (not BaseHTTPMiddleware).
+
+BaseHTTPMiddleware wraps streaming responses with receive_or_disconnect per chunk,
+which blocks the event loop and causes severe throughput degradation.
+"""
+from starlette.middleware.base import BaseHTTPMiddleware
+
+from litellm.proxy.middleware.prometheus_auth_middleware import PrometheusAuthMiddleware
+
+
+def test_is_not_base_http_middleware():
+    """PrometheusAuthMiddleware must NOT inherit from BaseHTTPMiddleware."""
+    assert not issubclass(PrometheusAuthMiddleware, BaseHTTPMiddleware), (
+        "PrometheusAuthMiddleware should be a pure ASGI middleware, not BaseHTTPMiddleware. "
+        "BaseHTTPMiddleware causes severe streaming performance degradation."
+    )
+
+
+def test_has_asgi_call_protocol():
+    """PrometheusAuthMiddleware must implement the ASGI __call__ protocol."""
+    assert "__call__" in PrometheusAuthMiddleware.__dict__, (
+        "PrometheusAuthMiddleware must define __call__(self, scope, receive, send)"
+    )