Skip to content
Closed
Show file tree
Hide file tree
Changes from 3 commits
Commits
Show all changes
43 commits
Select commit Hold shift + click to select a range
1c120bf
last todos
sfierro Feb 20, 2026
2b69ecb
Merge pull request #1067 from Kiln-AI/sfierro/last-changes
scosman Feb 20, 2026
60e7a48
version bump for release
scosman Feb 20, 2026
e80c067
version bump for release (lockfile)
scosman Feb 20, 2026
18a0b60
refactor: autosave_runs in memory (not persisted)
leonardmq Feb 24, 2026
f5af283
cr: explicitly prevent dumping in memory to yaml
leonardmq Feb 24, 2026
ec915f1
refactor: combine in memory and persisted in settings
leonardmq Feb 24, 2026
69bb336
cr feedback (mostly about locking)
leonardmq Feb 24, 2026
c8091a3
cr lock in setattr
leonardmq Feb 24, 2026
ce6b564
telemetry docs
scosman Feb 24, 2026
07e0fe3
Merge pull request #1083 from Kiln-AI/scosman/telemetry_docs_2
scosman Feb 24, 2026
cf56e58
Allow tool server edit when it fails to connect
chiang-daniel Feb 24, 2026
905dee5
CR from Leonard
chiang-daniel Feb 25, 2026
05f9d4d
Merge pull request #1084 from Kiln-AI/dchiang/fix-tool-edit-button
chiang-daniel Feb 25, 2026
f04a785
update MCP error stack
chiang-daniel Feb 26, 2026
a520b78
use markdown
chiang-daniel Feb 26, 2026
d802886
better description
chiang-daniel Feb 26, 2026
2aee473
CR
chiang-daniel Feb 26, 2026
f086060
CR from Leonard
chiang-daniel Feb 26, 2026
cd21709
clean up
chiang-daniel Feb 26, 2026
6d0d3c9
update tests to use custom_error
chiang-daniel Feb 26, 2026
544ee18
Merge pull request #1086 from Kiln-AI/dchiang/KIL-415/mcp-error-cleanup
chiang-daniel Feb 26, 2026
c4326d8
chore: update litellm
leonardmq Feb 27, 2026
20d2450
Merge pull request #1090 from Kiln-AI/leonard/kil-439-chore-update-li…
leonardmq Feb 27, 2026
a0d1646
save tool id for tool use spec
sfierro Feb 27, 2026
33fd958
Merge pull request #1097 from Kiln-AI/sfierro/tool-id-bug
sfierro Feb 27, 2026
57896a8
Proof of concept streaming API
scosman Feb 18, 2026
3d6ced2
test: paid integration test for streaming
leonardmq Mar 3, 2026
1464fb8
test: add test for session + streaming together
leonardmq Mar 3, 2026
dee10b3
Update libs/core/kiln_ai/adapters/model_adapters/test_litellm_adapter…
leonardmq Mar 3, 2026
3fabd00
Merge pull request #1082 from Kiln-AI/leonard/kil-428-make-autosave_r…
leonardmq Mar 6, 2026
c520e27
fix: pin uv tools in CI and checks
leonardmq Mar 6, 2026
1eb0fe6
fix: pinned uv run in another workflow and mcp hooks
leonardmq Mar 6, 2026
f18aa3b
Merge pull request #1104 from Kiln-AI/leonard/kil-442-fix-pin-uv-tool…
leonardmq Mar 6, 2026
456088d
refactor: stream with support for AI SDK (with tool events) and OpenA…
leonardmq Mar 8, 2026
0ea65b4
refactor: ai sdk events as pydantic models
leonardmq Mar 8, 2026
a98d886
fix: model_dump implementation and remove to_see to leave transport s…
leonardmq Mar 8, 2026
e989dca
fix: should reset before next round of toolcalls
leonardmq Mar 8, 2026
11710f2
refactor: take in a trace instead of a task_run for session continuation
leonardmq Mar 8, 2026
3ad6a27
refactor: remove ability to continue task run at api level
leonardmq Mar 8, 2026
3f08ed5
Merge branch 'main' of github.com:Kiln-AI/Kiln into leonard/kil-420-a…
leonardmq Mar 8, 2026
4d8e99f
refactor: wrap stream iterators to allow exposing task run at the end
leonardmq Mar 8, 2026
eb537ed
fix: remove autosave_runs hardcoded
leonardmq Mar 8, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file.
60 changes: 60 additions & 0 deletions libs/core/kiln_ai/adapters/litellm_utils/litellm_streaming.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
from __future__ import annotations

from typing import Any, AsyncIterator, Optional, Union

import litellm
from litellm.types.utils import (
ModelResponse,
ModelResponseStream,
TextCompletionResponse,
)


class StreamingCompletion:
    """
    Async-iterable wrapper around ``litellm.acompletion`` in streaming mode.

    Iterating the wrapper yields ``ModelResponseStream`` chunks as they
    arrive from the provider. Once the stream is exhausted, the full
    ``ModelResponse`` (assembled via ``litellm.stream_chunk_builder``) can be
    read from the ``.response`` property.

    Usage::

        stream = StreamingCompletion(model=..., messages=...)
        async for chunk in stream:
            # forward the chunk anywhere (print, log, send over WS, …)
            pass
        final = stream.response  # fully assembled ModelResponse
    """

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        # Copy the kwargs so the caller's dict is never mutated, and drop any
        # caller-supplied "stream" flag — streaming is always forced on.
        sanitized = {key: value for key, value in kwargs.items() if key != "stream"}
        self._args = args
        self._kwargs = sanitized
        self._response: Optional[Union[ModelResponse, TextCompletionResponse]] = None
        self._iterated: bool = False

    @property
    def response(self) -> Optional[Union[ModelResponse, TextCompletionResponse]]:
        """The final assembled response. Only available after iteration."""
        if not self._iterated:
            raise RuntimeError(
                "StreamingCompletion has not been iterated yet. "
                "Use 'async for chunk in stream:' before accessing .response"
            )
        return self._response

    async def __aiter__(self) -> AsyncIterator[ModelResponseStream]:
        # Reset state up front so the wrapper supports being iterated more
        # than once; every iteration issues a fresh completion call.
        self._iterated = False
        self._response = None

        collected: list[ModelResponseStream] = []
        raw_stream = await litellm.acompletion(*self._args, stream=True, **self._kwargs)

        async for piece in raw_stream:
            collected.append(piece)
            yield piece

        # Stream fully consumed: assemble the final response from all chunks.
        self._response = litellm.stream_chunk_builder(collected)
        self._iterated = True
139 changes: 139 additions & 0 deletions libs/core/kiln_ai/adapters/litellm_utils/test_litellm_streaming.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
from __future__ import annotations

from types import SimpleNamespace
from typing import Any, List
from unittest.mock import MagicMock, patch

import pytest

from kiln_ai.adapters.litellm_utils.litellm_streaming import StreamingCompletion


def _make_chunk(content: str | None = None, finish_reason: str | None = None) -> Any:
    """Construct a minimal stand-in for a litellm streaming chunk."""
    return SimpleNamespace(
        choices=[
            SimpleNamespace(
                delta=SimpleNamespace(content=content, role="assistant"),
                finish_reason=finish_reason,
                index=0,
            )
        ],
        id="chatcmpl-test",
        model="test-model",
    )


async def _async_iter(items: List[Any]):
    """Adapt a plain list into an async iterator (mocks a streaming response)."""
    for element in items:
        yield element


@pytest.fixture
def mock_acompletion():
    # Patch litellm.acompletion for the duration of a single test.
    with patch("litellm.acompletion") as patched:
        yield patched


@pytest.fixture
def mock_chunk_builder():
    # Patch litellm.stream_chunk_builder for the duration of a single test.
    with patch("litellm.stream_chunk_builder") as patched:
        yield patched


class TestStreamingCompletion:
    async def test_yields_all_chunks(self, mock_acompletion, mock_chunk_builder):
        expected = [_make_chunk("Hello"), _make_chunk(" world"), _make_chunk("!")]
        mock_acompletion.return_value = _async_iter(expected)
        mock_chunk_builder.return_value = MagicMock(name="final_response")

        wrapper = StreamingCompletion(model="test", messages=[])
        collected = []
        async for piece in wrapper:
            collected.append(piece)

        assert collected == expected

    async def test_response_available_after_iteration(
        self, mock_acompletion, mock_chunk_builder
    ):
        mock_acompletion.return_value = _async_iter([_make_chunk("hi")])
        final = MagicMock(name="final_response")
        mock_chunk_builder.return_value = final

        wrapper = StreamingCompletion(model="test", messages=[])
        async for _ in wrapper:
            pass

        assert wrapper.response is final

    async def test_response_raises_before_iteration(self):
        wrapper = StreamingCompletion(model="test", messages=[])
        with pytest.raises(RuntimeError, match="not been iterated"):
            _ = wrapper.response

    async def test_stream_kwarg_is_stripped(self, mock_acompletion, mock_chunk_builder):
        mock_acompletion.return_value = _async_iter([])
        mock_chunk_builder.return_value = None

        # Even an explicit stream=False from the caller must be overridden.
        wrapper = StreamingCompletion(model="test", messages=[], stream=False)
        async for _ in wrapper:
            pass

        assert mock_acompletion.call_args.kwargs["stream"] is True

    async def test_passes_args_and_kwargs_through(
        self, mock_acompletion, mock_chunk_builder
    ):
        mock_acompletion.return_value = _async_iter([])
        mock_chunk_builder.return_value = None

        wrapper = StreamingCompletion(
            model="gpt-4", messages=[{"role": "user", "content": "hi"}], temperature=0.5
        )
        async for _ in wrapper:
            pass

        forwarded = mock_acompletion.call_args.kwargs
        assert forwarded["model"] == "gpt-4"
        assert forwarded["messages"] == [{"role": "user", "content": "hi"}]
        assert forwarded["temperature"] == 0.5
        assert forwarded["stream"] is True

    async def test_chunks_passed_to_builder(self, mock_acompletion, mock_chunk_builder):
        pieces = [_make_chunk("a"), _make_chunk("b")]
        mock_acompletion.return_value = _async_iter(pieces)
        mock_chunk_builder.return_value = MagicMock()

        wrapper = StreamingCompletion(model="test", messages=[])
        async for _ in wrapper:
            pass

        mock_chunk_builder.assert_called_once_with(pieces)

    async def test_re_iteration_resets_state(
        self, mock_acompletion, mock_chunk_builder
    ):
        # Each pass through the wrapper should trigger a fresh completion
        # call and replace the previously assembled response.
        round_one = [_make_chunk("first")]
        round_two = [_make_chunk("second")]
        response_one = MagicMock(name="first_response")
        response_two = MagicMock(name="second_response")

        mock_acompletion.side_effect = [
            _async_iter(round_one),
            _async_iter(round_two),
        ]
        mock_chunk_builder.side_effect = [response_one, response_two]

        wrapper = StreamingCompletion(model="test", messages=[])

        async for _ in wrapper:
            pass
        assert wrapper.response is response_one

        async for _ in wrapper:
            pass
        assert wrapper.response is response_two

    async def test_empty_stream(self, mock_acompletion, mock_chunk_builder):
        mock_acompletion.return_value = _async_iter([])
        mock_chunk_builder.return_value = None

        wrapper = StreamingCompletion(model="test", messages=[])
        collected = [piece async for piece in wrapper]

        assert collected == []
        assert wrapper.response is None
17 changes: 14 additions & 3 deletions libs/core/kiln_ai/adapters/model_adapters/base_adapter.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
import json
from abc import ABCMeta, abstractmethod
from collections.abc import Awaitable, Callable
from dataclasses import dataclass
from typing import Dict, Tuple

from litellm.types.utils import ModelResponseStream

from kiln_ai.adapters.chat.chat_formatter import (
ChatFormatter,
MultiturnFormatter,
Expand Down Expand Up @@ -49,6 +52,8 @@
from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error
from kiln_ai.utils.open_ai_types import ChatCompletionMessageParam

StreamCallback = Callable[[ModelResponseStream], Awaitable[None]]


@dataclass
class AdapterConfig:
Expand Down Expand Up @@ -128,9 +133,10 @@ async def invoke(
input: InputType,
input_source: DataSource | None = None,
existing_run: TaskRun | None = None,
on_chunk: StreamCallback | None = None,
) -> TaskRun:
run_output, _ = await self.invoke_returning_run_output(
input, input_source, existing_run
input, input_source, existing_run, on_chunk=on_chunk
)
return run_output

Expand All @@ -139,6 +145,7 @@ async def _run_returning_run_output(
input: InputType,
input_source: DataSource | None = None,
existing_run: TaskRun | None = None,
on_chunk: StreamCallback | None = None,
) -> Tuple[TaskRun, RunOutput]:
# validate input, allowing arrays
if self.input_schema is not None:
Expand Down Expand Up @@ -166,7 +173,9 @@ async def _run_returning_run_output(
formatted_input = formatter.format_input(input)

# Run
run_output, usage = await self._run(formatted_input, prior_trace=prior_trace)
run_output, usage = await self._run(
formatted_input, prior_trace=prior_trace, on_chunk=on_chunk
)

# Parse
provider = self.model_provider()
Expand Down Expand Up @@ -256,6 +265,7 @@ async def invoke_returning_run_output(
input: InputType,
input_source: DataSource | None = None,
existing_run: TaskRun | None = None,
on_chunk: StreamCallback | None = None,
) -> Tuple[TaskRun, RunOutput]:
# Determine if this is the root agent (no existing run context)
is_root_agent = get_agent_run_id() is None
Expand All @@ -266,7 +276,7 @@ async def invoke_returning_run_output(

try:
return await self._run_returning_run_output(
input, input_source, existing_run
input, input_source, existing_run, on_chunk=on_chunk
)
finally:
if is_root_agent:
Expand All @@ -289,6 +299,7 @@ async def _run(
self,
input: InputType,
prior_trace: list[ChatCompletionMessageParam] | None = None,
on_chunk: StreamCallback | None = None,
) -> Tuple[RunOutput, Usage | None]:
pass

Expand Down
17 changes: 13 additions & 4 deletions libs/core/kiln_ai/adapters/model_adapters/litellm_adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
from dataclasses import dataclass
from typing import Any, Dict, List, Tuple

import litellm
from litellm.types.utils import (
ChatCompletionMessageToolCall,
ChoiceLogprobs,
Expand All @@ -20,6 +19,7 @@

import kiln_ai.datamodel as datamodel
from kiln_ai.adapters.chat import ChatCompletionMessageIncludingLiteLLM
from kiln_ai.adapters.litellm_utils.litellm_streaming import StreamingCompletion
from kiln_ai.adapters.ml_model_list import (
KilnModelProvider,
ModelProviderName,
Expand All @@ -29,6 +29,7 @@
AdapterConfig,
BaseAdapter,
RunOutput,
StreamCallback,
Usage,
)
from kiln_ai.adapters.model_adapters.litellm_config import LiteLlmConfig
Expand Down Expand Up @@ -95,6 +96,7 @@ async def _run_model_turn(
prior_messages: list[ChatCompletionMessageIncludingLiteLLM],
top_logprobs: int | None,
skip_response_format: bool,
on_chunk: StreamCallback | None = None,
) -> ModelTurnResult:
"""
Call the model for a single top level turn: from user message to agent message.
Expand All @@ -118,7 +120,7 @@ async def _run_model_turn(

# Make the completion call
model_response, response_choice = await self.acompletion_checking_response(
**completion_kwargs
on_chunk=on_chunk, **completion_kwargs
)

# count the usage
Expand Down Expand Up @@ -185,6 +187,7 @@ async def _run(
self,
input: InputType,
prior_trace: list[ChatCompletionMessageParam] | None = None,
on_chunk: StreamCallback | None = None,
) -> tuple[RunOutput, Usage | None]:
usage = Usage()

Expand Down Expand Up @@ -229,6 +232,7 @@ async def _run(
messages,
self.base_adapter_config.top_logprobs if turn.final_call else None,
skip_response_format,
on_chunk=on_chunk,
)

usage += turn_result.usage
Expand Down Expand Up @@ -297,9 +301,14 @@ def _extract_reasoning_to_intermediate_outputs(
intermediate_outputs["reasoning"] = stripped_reasoning_content

async def acompletion_checking_response(
self, **kwargs
self, on_chunk: StreamCallback | None = None, **kwargs
) -> Tuple[ModelResponse, Choices]:
response = await litellm.acompletion(**kwargs)
stream = StreamingCompletion(**kwargs)
async for chunk in stream:
if on_chunk is not None:
await on_chunk(chunk)
response = stream.response

if (
not isinstance(response, ModelResponse)
or not response.choices
Expand Down
11 changes: 9 additions & 2 deletions libs/core/kiln_ai/adapters/model_adapters/mcp_adapter.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
import json
from typing import Tuple

from kiln_ai.adapters.model_adapters.base_adapter import AdapterConfig, BaseAdapter
from kiln_ai.adapters.model_adapters.base_adapter import (
AdapterConfig,
BaseAdapter,
StreamCallback,
)
from kiln_ai.adapters.parsers.json_parser import parse_json_string
from kiln_ai.adapters.run_output import RunOutput
from kiln_ai.datamodel import DataSource, Task, TaskRun, Usage
Expand Down Expand Up @@ -46,6 +50,7 @@ async def _run(
self,
input: InputType,
prior_trace: list[ChatCompletionMessageParam] | None = None,
on_chunk: StreamCallback | None = None,
) -> Tuple[RunOutput, Usage | None]:
if prior_trace is not None:
raise NotImplementedError(
Expand Down Expand Up @@ -86,6 +91,7 @@ async def invoke(
input: InputType,
input_source: DataSource | None = None,
existing_run: TaskRun | None = None,
on_chunk: StreamCallback | None = None,
) -> TaskRun:
if existing_run is not None:
raise NotImplementedError(
Expand All @@ -94,7 +100,7 @@ async def invoke(
)

run_output, _ = await self.invoke_returning_run_output(
input, input_source, existing_run
input, input_source, existing_run, on_chunk=on_chunk
)
return run_output

Expand All @@ -103,6 +109,7 @@ async def invoke_returning_run_output(
input: InputType,
input_source: DataSource | None = None,
existing_run: TaskRun | None = None,
on_chunk: StreamCallback | None = None,
) -> Tuple[TaskRun, RunOutput]:
"""
Runs the task and returns both the persisted TaskRun and raw RunOutput.
Expand Down
Loading
Loading