withmartian · joshgreaves · Mar 13, 2026 · Mar 16, 2026 · Mar 16, 2026 · Mar 26, 2026
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -173,9 +173,9 @@ class Container(Protocol):
 #### 4. LLM Clients (`src/ares/llms/`)
 
 **Core Abstractions:**
-- `LLMRequest` - Dataclass with messages and optional temperature
-- `LLMResponse` - Dataclass with ChatCompletion and cost tracking
-- `LLMClient` Protocol - `async def __call__(request: LLMRequest) -> LLMResponse`
+- `lft.OpenResponsesRequest` - Canonical Open Responses request (from linguafranca) used for observations and client inputs
+- `InferenceResult` - Dataclass wrapping `lft.OpenResponsesResponse` with cost tracking
+- `LLMClient` Protocol - `async def __call__(request: lft.OpenResponsesRequest) -> InferenceResult`
 
 **Key Pattern: Queue-Mediated LLM Client (`queue_mediated_client.py`):**
 
@@ -281,12 +281,13 @@ Follow Google-style isort configuration:
 - **Always import modules, not classes or functions**
 - **External consumers** (examples, docs):
   - ✅ Good: `import ares` → use `ares.make(...)`
-  - ✅ Good: `from ares import llms` → use `llms.LLMRequest`, `llms.TextData`
-  - ❌ Avoid: `from ares.llms import LLMRequest, TextData`
+  - ✅ Good: `from ares.llms import open_responses` → use `open_responses.make_request(...)`
+  - ✅ Good: `from ares import llms` → use `llms.TextData`, `llms.Usage`
+  - ❌ Avoid: `from ares.llms import OpenResponsesRequest, TextData`
 - **Internal code**:
-  - ✅ Good: `from ares.llms import request` → use `request.LLMRequest`
+  - ✅ Good: `from ares.llms import open_responses` → use `open_responses.make_request(...)`
   - ✅ Good: `from ares.llms import response` → use `response.TextData`, `response.Usage`
-  - ❌ Avoid: `from ares.llms.request import LLMRequest`
+  - ❌ Avoid: `from ares.llms.open_responses import Request`
   - ❌ Avoid: `from ares.llms.response import TextData, Usage`
 - Rationale: Makes code more readable and explicit about where objects come from
 

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -103,21 +103,21 @@ Follow **Google-style imports**: always import modules, not individual classes o
 ```python
 # Good ✅
 import ares
-from ares import llms
+from ares.llms import open_responses
 
-request = llms.LLMRequest(messages=[...])
+request = open_responses.make_request([open_responses.user_message("Hello")])
 env = ares.make("sbv-mswea")
 
 # Good for internal code ✅
-from ares.llms import request
+from ares.llms import open_responses
 from ares.llms import response
 
-req = request.LLMRequest(messages=[...])
-resp = response.LLMResponse(data=[...], cost=0.0, usage=...)
+req = open_responses.make_request([open_responses.user_message("Hello")])
+resp = response.InferenceResult(response=response.make_response("Hello!"), cost=0.0)
 
 # Avoid ❌
-from ares.llms import LLMRequest, TextData
-from ares.llms.request import LLMRequest
+from ares.llms import OpenResponsesRequest, TextData
+from ares.llms.open_responses import Request
 ```
 
 **Rationale:** Makes code more readable and explicit about where objects come from.

diff --git a/README.md b/README.md
@@ -15,7 +15,7 @@ ARES is an RL-first framework for training and evaluating LLM agents, especially
 
 It is a modern [gym](https://github.com/Farama-Foundation/Gymnasium): the environment layer powering RL research.
 
-ARES treats LLMRequests as observations and LLMResponses as actions within the environment, so you can focus on training just the LLM - not the Code Agent surrounding it. The interface is entirely async, and supports scaling up to hundreds or thousands of parallel environments easily - check out [example 3](https://github.com/withmartian/ares/tree/main/examples/03_parallel_eval_with_api.py) to run this yourself.
+ARES treats Open Responses requests as observations and `InferenceResult` objects as actions within the environment, so you can focus on training just the LLM - not the Code Agent surrounding it. The interface is entirely async and supports scaling up to hundreds or thousands of parallel environments easily - check out [example 3](https://github.com/withmartian/ares/tree/main/examples/03_parallel_eval_with_api.py) to run this yourself.
 
 
 ## Quick Start

diff --git a/docs/source/core-concepts.rst b/docs/source/core-concepts.rst
@@ -11,7 +11,7 @@ It's important to understand two different concepts in ARES:
     The orchestration logic that uses a Container and LLM to solve tasks (e.g., MiniSWECodeAgent). This is **part of the environment** and remains fixed during training. Think of it as the scaffold that defines how an LLM interacts with code.
 
 * Agent/Policy (Trained)
-    The component you're actually training - a function that maps ``LLMRequest → LLMResponse``. This could be a fine-tuned LLM, a prompt optimizer, or any policy that produces better responses. This is what improves through reinforcement learning.
+    The component you're actually training - a function that maps ``OpenResponsesRequest → InferenceResult``. This could be a fine-tuned LLM, a prompt optimizer, or any policy that produces better responses. This is what improves through reinforcement learning.
 
 System Architecture
 -------------------
@@ -30,13 +30,13 @@ Here's how the components fit together:
     |  generates response    |            │                                      │
     └──────────┬─────────────┘            │  ┌────────────────────────────────┐  │
         ^      │                          │  │   QueueMediatedLLMClient       │  │
-        |      │ LLMResponse (action)     │  │                                │  │
+        |      │ InferenceResult (action) │  │                                │  │
         |      └──────────────────────────┼─>│   Intercepts LLM calls         │  │
         |                                 │  │   from code agent via          │  │
         └─────────────────────────────────┼──│   QueueMediatedLLMClient       │  │
-                 LLMRequest (observation) │  └──────────────────┬─────────────┘  │
+           Open Responses observation     │  └──────────────────┬─────────────┘  │
                                           │                 ^   │                │
-                                          │      LLMRequest │   │ LLMResponse    │
+                                          │ Open Responses  │   │ InferenceResult│
                                           │                 │   v                │
                                           │  ┌──────────────└─────────────────┐  │
                                           │  │       CodeAgent                │  │
@@ -87,7 +87,7 @@ The key abstraction is ``CodeEnvironment``, which:
 * **Exposes LLM requests as observations** - Intercepts calls from the code agent
 * **Treats LLM responses as actions** - Your trainable agent/policy provides responses
 
-Crucially, the **CodeAgent is part of the environment**, not what you're training. Your training loop optimizes an agent/policy that produces better ``LLMResponse`` outputs given ``LLMRequest`` observations.
+Crucially, the **CodeAgent is part of the environment**, not what you're training. Your training loop optimizes an agent/policy that produces better ``InferenceResult`` outputs given canonical Open Responses observations.
 
 Standard RL Loop
 ~~~~~~~~~~~~~~~~
@@ -101,10 +101,10 @@ Every environment follows the standard RL pattern:
         timestep = await env.reset()
 
         while not timestep.last():
-            # timestep.observation is an LLMRequest from the code agent
+            # timestep.observation is an Open Responses request from the code agent
             action = await your_policy(timestep.observation)
 
-            # action is an LLMResponse that continues the agent's execution
+            # action is an InferenceResult that continues the agent's execution
             timestep = await env.step(action)
 
         # timestep.reward contains the reward for the final step
@@ -116,7 +116,7 @@ TimeStep Structure
 Each call to ``reset()`` or ``step()`` returns a ``TimeStep`` with:
 
 * ``step_type``: One of ``"FIRST"``, ``"MID"``, or ``"LAST"``
-* ``observation``: An ``LLMRequest`` object (or ``None`` on termination)
+* ``observation``: An ``OpenResponsesRequest`` object (or ``None`` on termination)
 * ``reward``: A float reward for each step
 * ``discount``: A float discount factor for RL algorithms
 
@@ -160,7 +160,7 @@ Example structure:
         async def run(self, task: str) -> None:
             while not self.is_done():
                 # Ask LLM what to do next
-                request = LLMRequest(messages=[...])
+                request = open_responses.make_request([open_responses.user_message(...)])
                 response = await self._llm_client(request)
 
                 # Parse and execute commands from LLM response
@@ -234,8 +234,8 @@ Which you will need to rewrite into something like:
                 # Decide what to ask LLM next
                 ...
                 llm_response = await self.llm_client(
-                    LLMRequest(
-                        messages=[...],
+                    open_responses.make_request(
+                        [open_responses.user_message(...)],
                         ...  # Other request params
                     )
                 )
@@ -293,30 +293,27 @@ Core Interface
 
 .. code-block:: python
 
+    from linguafranca import types as lft
+
     class LLMClient(Protocol):
-        async def __call__(self, request: LLMRequest) -> LLMResponse:
+        async def __call__(self, request: lft.OpenResponsesRequest) -> InferenceResult:
             ...
 
     @dataclass(frozen=True)
-    class LLMRequest:
-        messages: Iterable[ChatCompletionMessageParam]
-        temperature: float | None = None
-
-    @dataclass(frozen=True)
-    class LLMResponse:
-        chat_completion_response: ChatCompletion
+    class InferenceResult:
+        response: lft.OpenResponsesResponse
         cost: float
 
-This simple interface wraps OpenAI-style chat completion APIs. The ``messages`` field follows the OpenAI format with ``role`` (system/user/assistant) and ``content``.
+ARES uses linguafranca's ``OpenResponsesRequest`` as the canonical request type for observations and client inputs. Edge adapters convert to Chat/Responses/Anthropic formats only when needed.
 
 Why LLMClient?
 ~~~~~~~~~~~~~~
 
 The ``LLMClient`` abstraction serves two purposes:
 
-1. **Observations = LLM Requests**: In the RL loop, ``timestep.observation`` is an ``LLMRequest`` containing the messages the code agent wants to send to the LLM. This is the "state" your policy observes.
+1. **Observations = Open Responses requests**: In the RL loop, ``timestep.observation`` is a canonical Open Responses request containing what the code agent wants to send to the LLM. This is the "state" your policy observes.
 
-2. **Actions = LLM Responses**: In the RL loop, the ``action`` you pass to ``env.step()`` is an ``LLMResponse`` containing the LLM's reply. This is how your policy controls the agent's behavior.
+2. **Actions = LLM Responses**: In the RL loop, the ``action`` you pass to ``env.step()`` is an ``InferenceResult`` containing the LLM's reply. This is how your policy controls the agent's behavior.
 
 This framing makes it natural to think about code agent training as an RL problem: you're learning a policy that maps agent requests to helpful responses.
 

diff --git a/docs/source/how-it-works.rst b/docs/source/how-it-works.rst
@@ -28,7 +28,7 @@ The ``QueueMediatedLLMClient`` implements the ``LLMClient`` protocol, but instea
 
 Meanwhile, the environment:
 
-1. **Watches the queue**: Extracts ``LLMRequest`` objects as they arrive
+1. **Watches the queue**: Extracts canonical Open Responses requests as they arrive
 2. **Exposes them as observations**: Returns them from ``reset()`` and ``step()``
 3. **Provides responses**: When you call ``step(action)``, sets the Future's result
 
@@ -39,12 +39,14 @@ The core implementation is simple:
 
 .. code-block:: python
 
+    from linguafranca import types as lft
+
     @dataclass(frozen=True)
     class QueueMediatedLLMClient(LLMClient):
-        q: asyncio.Queue[ValueAndFuture[LLMRequest, LLMResponse]]
+        q: asyncio.Queue[ValueAndFuture[lft.OpenResponsesRequest, InferenceResult]]
 
-        async def __call__(self, request: LLMRequest) -> LLMResponse:
-            future = asyncio.Future[LLMResponse]()
+        async def __call__(self, request: lft.OpenResponsesRequest) -> InferenceResult:
+            future = asyncio.Future[InferenceResult]()
             await self.q.put(ValueAndFuture(value=request, future=future))
             return await future  # Blocks until env provides response
 
@@ -65,7 +67,7 @@ The environment side:
             self._llm_req_future = value_and_future.future
             return TimeStep(step_type="MID", observation=value_and_future.value, ...)
 
-    async def step(self, action: LLMResponse) -> TimeStep:
+    async def step(self, action: InferenceResult) -> TimeStep:
         # Unblock the code agent by providing response
         self._llm_req_future.set_result(action)
         return await self._get_time_step()

diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -20,10 +20,10 @@ See the main `README <https://github.com/withmartian/ares>`_ for installation in
 Key Features
 ------------
 
-* **RL-First Design**: Built around the reinforcement learning loop with observations (LLM requests) and actions (LLM responses)
+* **RL-First Design**: Built around the reinforcement learning loop with observations (Open Responses requests) and actions (LLM responses)
 * **LLM-Level Optimization**: Train the LLM within code agents, not just the agent as a whole
 * **Distributed Workloads**: Support for high-volume, distributed training and evaluation
-* **Mechanistic Interpretability**: Raw access to LLM requests and responses for deep analysis
+* **Mechanistic Interpretability**: Raw access to canonical LLM requests and responses for deep analysis
 * **Async Gym/dm_env like Spec**: Close to Gym/dm_env spec, but incorporating async methods for performance 
 
 Indices and tables

diff --git a/examples/04_rl_training_with_skyrl.py b/examples/04_rl_training_with_skyrl.py
@@ -48,7 +48,9 @@
 
 import ares
 from ares import llms
+from ares.llms import open_responses
 import hydra
+from linguafranca import types as lft
 import omegaconf
 import ray
 import skyrl_gym
@@ -91,7 +93,7 @@ def __init__(self, env_config: dict | None = None, extras: dict | None = None, *
         self.preset_name = extras.get("preset_name", kwargs.get("preset_name"))
         if not self.preset_name:
             raise ValueError("preset_name must be provided in extras or kwargs")
-        self.env: ares.Environment[llms.LLMResponse, llms.LLMRequest, float, float] | None = None
+        self.env: ares.Environment[llms.InferenceResult, lft.OpenResponsesRequest, float, float] | None = None
 
     async def init(
         self, prompt: base_text_env.ConversationType
@@ -104,7 +106,8 @@ async def init(
         await self.env.__aenter__()
         ts = await self.env.reset()
 
-        return ts.observation.messages, {}  # type: ignore
+        assert ts.observation is not None
+        return open_responses.to_chat_messages(ts.observation, strict=True), {}
 
     async def step(self, action: str) -> base_text_env.BaseTextEnvStepOutput:
         """Runs one environment step.
@@ -119,18 +122,17 @@ async def step(self, action: str) -> base_text_env.BaseTextEnvStepOutput:
         """
         assert self.env is not None
 
-        llm_resp = llms.LLMResponse(
-            data=[llms.TextData(content=action)],
+        llm_resp = llms.InferenceResult(
+            response=llms.make_response(action),
             cost=0.0,
-            usage=llms.Usage(prompt_tokens=-1, generated_tokens=-1),
         )
         ts = await self.env.step(llm_resp)
 
         if ts.last():
             # Hack to approximate a context manager
             await self.env.__aexit__(None, None, None)
 
-        msgs = [] if ts.last() else ts.observation.messages
+        msgs = [] if ts.last() else open_responses.to_chat_messages(ts.observation, strict=True)
         return base_text_env.BaseTextEnvStepOutput(
             observations=msgs,
             reward=ts.reward or 0.0,

diff --git a/examples/05_tinker_train.py b/examples/05_tinker_train.py
@@ -49,8 +49,10 @@
 import ares
 from ares import containers
 from ares import llms
+from ares.llms import open_responses
 import chz
 import frozendict
+from linguafranca import types as lft
 import numpy as np
 import tinker
 from tinker_cookbook import cli_utils
@@ -109,8 +111,8 @@ class TinkerCompatibleEnv(tinker_types.Env):
     """Adapter wrapping ARES environments to work with Tinker's RL training loop.
 
     Handles bidirectional conversion:
-    - ARES LLMRequest -> Tinker ModelInput (tokenized prompts)
-    - Tinker Action (text) -> ARES LLMResponse
+    - ARES Open Responses request -> Tinker ModelInput (tokenized prompts)
+    - Tinker Action (text) -> ARES InferenceResult
     - ARES TimeStep -> Tinker StepResult
 
     This enables using any ARES environment with Tinker's training infrastructure.
@@ -121,7 +123,7 @@ class TinkerCompatibleEnv(tinker_types.Env):
 
     def __init__(
         self,
-        env: ares.Environment[llms.LLMResponse, llms.LLMRequest, float, float],
+        env: ares.Environment[llms.InferenceResult, lft.OpenResponsesRequest, float, float],
         renderer: renderers.Renderer,
         convo_prefix: list[renderers.Message] | None,
         max_tokens: int,
@@ -132,14 +134,14 @@ def __init__(
         self.max_tokens = max_tokens
 
     def _get_tinker_observation(
-        self, ts: ares.TimeStep[llms.LLMRequest | None, float, float]
+        self, ts: ares.TimeStep[lft.OpenResponsesRequest | None, float, float]
     ) -> tinker_types.Observation:
         if ts.observation is None:
             return tinker.ModelInput.empty()
 
         messages = self.convo_prefix + [
             renderers.Message(role=message["role"], content=message["content"])  # type: ignore
-            for message in ts.observation.messages
+            for message in open_responses.to_chat_messages(ts.observation, strict=True)
         ]
         model_input = self.renderer.build_generation_prompt(messages)
 
@@ -149,15 +151,14 @@ def _get_tinker_observation(
 
         return model_input
 
-    def _get_ares_action(self, action: tinker_types.Action) -> llms.LLMResponse:
+    def _get_ares_action(self, action: tinker_types.Action) -> llms.InferenceResult:
         message, parse_success = self.renderer.parse_response(action)
         if not parse_success:
             _LOGGER.warning("Failed to parse response: %s", message)
 
-        return llms.LLMResponse(
-            data=[llms.TextData(content=_get_text_content(message))],
+        return llms.InferenceResult(
+            response=llms.make_response(_get_text_content(message)),
             cost=0.0,
-            usage=llms.Usage(prompt_tokens=-1, generated_tokens=-1),
         )
 
     @property