Commit 8b577ec: Release v6.2.3 (#323)

Authored by: jeremyfowers, danielholanda, ramkrishna2910, vgodsoe, kovtcharov

Co-authored-by: Daniel Holanda <holand.daniel@gmail.com>
Co-authored-by: Krishna Sivakumar <Krishna.Sivakumar@amd.com>
Co-authored-by: Victoria Godsoe <victoria.godsoe@amd.com>
Co-authored-by: Kalin Ovtcharov <kalin@extropolis.ai>

Parent: f358003

File tree: 29 files changed, +1030 −110 lines


.github/workflows/test_server.yml

Lines changed: 4 additions & 0 deletions
```diff
@@ -37,6 +37,10 @@ jobs:
           python -m pip check
           pip install -e .[llm-oga-cpu]
           lemonade-server-dev pull Qwen2.5-0.5B-Instruct-CPU
+      - name: Run server tests (unit tests)
+        shell: bash -el {0}
+        run: |
+          python test/lemonade/server_unit.py
       - name: Run server tests (network online mode)
         shell: bash -el {0}
         run: |
```

docs/lemonade/server_models.md

Lines changed: 60 additions & 0 deletions
````diff
@@ -26,6 +26,10 @@ lemonade-server pull Qwen2.5-0.5B-Instruct-CPU
 <details>
 <summary>Llama-3.2-1B-Instruct-Hybrid</summary>
 
+```bash
+lemonade-server pull Llama-3.2-1B-Instruct-Hybrid
+```
+
 | Key | Value |
 | --- | ----- |
 | Checkpoint | [amd/Llama-3.2-1B-Instruct-awq-g128-int4-asym-fp16-onnx-hybrid](https://huggingface.co/amd/Llama-3.2-1B-Instruct-awq-g128-int4-asym-fp16-onnx-hybrid) |
@@ -37,6 +41,10 @@ lemonade-server pull Qwen2.5-0.5B-Instruct-CPU
 <details>
 <summary>Llama-3.2-3B-Instruct-Hybrid</summary>
 
+```bash
+lemonade-server pull Llama-3.2-3B-Instruct-Hybrid
+```
+
 | Key | Value |
 | --- | ----- |
 | Checkpoint | [amd/Llama-3.2-3B-Instruct-awq-g128-int4-asym-fp16-onnx-hybrid](https://huggingface.co/amd/Llama-3.2-3B-Instruct-awq-g128-int4-asym-fp16-onnx-hybrid) |
@@ -48,6 +56,10 @@ lemonade-server pull Qwen2.5-0.5B-Instruct-CPU
 <details>
 <summary>Phi-3-Mini-Instruct-Hybrid</summary>
 
+```bash
+lemonade-server pull Phi-3-Mini-Instruct-Hybrid
+```
+
 | Key | Value |
 | --- | ----- |
 | Checkpoint | [amd/Phi-3-mini-4k-instruct-awq-g128-int4-asym-fp16-onnx-hybrid](https://huggingface.co/amd/Phi-3-mini-4k-instruct-awq-g128-int4-asym-fp16-onnx-hybrid) |
@@ -59,6 +71,10 @@ lemonade-server pull Qwen2.5-0.5B-Instruct-CPU
 <details>
 <summary>Qwen-1.5-7B-Chat-Hybrid</summary>
 
+```bash
+lemonade-server pull Qwen-1.5-7B-Chat-Hybrid
+```
+
 | Key | Value |
 | --- | ----- |
 | Checkpoint | [amd/Qwen1.5-7B-Chat-awq-g128-int4-asym-fp16-onnx-hybrid](https://huggingface.co/amd/Qwen1.5-7B-Chat-awq-g128-int4-asym-fp16-onnx-hybrid) |
@@ -70,6 +86,10 @@ lemonade-server pull Qwen2.5-0.5B-Instruct-CPU
 <details>
 <summary>DeepSeek-R1-Distill-Llama-8B-Hybrid</summary>
 
+```bash
+lemonade-server pull DeepSeek-R1-Distill-Llama-8B-Hybrid
+```
+
 | Key | Value |
 | --- | ----- |
 | Checkpoint | [amd/DeepSeek-R1-Distill-Llama-8B-awq-asym-uint4-g128-lmhead-onnx-hybrid](https://huggingface.co/amd/DeepSeek-R1-Distill-Llama-8B-awq-asym-uint4-g128-lmhead-onnx-hybrid) |
@@ -81,6 +101,10 @@ lemonade-server pull Qwen2.5-0.5B-Instruct-CPU
 <details>
 <summary>DeepSeek-R1-Distill-Qwen-7B-Hybrid</summary>
 
+```bash
+lemonade-server pull DeepSeek-R1-Distill-Qwen-7B-Hybrid
+```
+
 | Key | Value |
 | --- | ----- |
 | Checkpoint | [amd/DeepSeek-R1-Distill-Qwen-7B-awq-asym-uint4-g128-lmhead-onnx-hybrid](https://huggingface.co/amd/DeepSeek-R1-Distill-Qwen-7B-awq-asym-uint4-g128-lmhead-onnx-hybrid) |
@@ -92,6 +116,10 @@ lemonade-server pull Qwen2.5-0.5B-Instruct-CPU
 <details>
 <summary>Mistral-7B-v0.3-Instruct-Hybrid</summary>
 
+```bash
+lemonade-server pull Mistral-7B-v0.3-Instruct-Hybrid
+```
+
 | Key | Value |
 | --- | ----- |
 | Checkpoint | [amd/Mistral-7B-Instruct-v0.3-awq-g128-int4-asym-fp16-onnx-hybrid](https://huggingface.co/amd/Mistral-7B-Instruct-v0.3-awq-g128-int4-asym-fp16-onnx-hybrid) |
@@ -103,6 +131,10 @@ lemonade-server pull Qwen2.5-0.5B-Instruct-CPU
 <details>
 <summary>Llama-3.1-8B-Instruct-Hybrid</summary>
 
+```bash
+lemonade-server pull Llama-3.1-8B-Instruct-Hybrid
+```
+
 | Key | Value |
 | --- | ----- |
 | Checkpoint | [amd/Llama-3.1-8B-Instruct-awq-asym-uint4-g128-lmhead-onnx-hybrid](https://huggingface.co/amd/Llama-3.1-8B-Instruct-awq-asym-uint4-g128-lmhead-onnx-hybrid) |
@@ -117,6 +149,10 @@ lemonade-server pull Qwen2.5-0.5B-Instruct-CPU
 <details>
 <summary>Qwen2.5-0.5B-Instruct-CPU</summary>
 
+```bash
+lemonade-server pull Qwen2.5-0.5B-Instruct-CPU
+```
+
 | Key | Value |
 | --- | ----- |
 | Checkpoint | [amd/Qwen2.5-0.5B-Instruct-quantized_int4-float16-cpu-onnx](https://huggingface.co/amd/Qwen2.5-0.5B-Instruct-quantized_int4-float16-cpu-onnx) |
@@ -128,6 +164,10 @@ lemonade-server pull Qwen2.5-0.5B-Instruct-CPU
 <details>
 <summary>Llama-3.2-1B-Instruct-CPU</summary>
 
+```bash
+lemonade-server pull Llama-3.2-1B-Instruct-CPU
+```
+
 | Key | Value |
 | --- | ----- |
 | Checkpoint | [amd/Llama-3.2-1B-Instruct-awq-uint4-float16-cpu-onnx](https://huggingface.co/amd/Llama-3.2-1B-Instruct-awq-uint4-float16-cpu-onnx) |
@@ -139,6 +179,10 @@ lemonade-server pull Qwen2.5-0.5B-Instruct-CPU
 <details>
 <summary>Llama-3.2-3B-Instruct-CPU</summary>
 
+```bash
+lemonade-server pull Llama-3.2-3B-Instruct-CPU
+```
+
 | Key | Value |
 | --- | ----- |
 | Checkpoint | [amd/Llama-3.2-3B-Instruct-awq-uint4-float16-cpu-onnx](https://huggingface.co/amd/Llama-3.2-3B-Instruct-awq-uint4-float16-cpu-onnx) |
@@ -150,6 +194,10 @@ lemonade-server pull Qwen2.5-0.5B-Instruct-CPU
 <details>
 <summary>Phi-3-Mini-Instruct-CPU</summary>
 
+```bash
+lemonade-server pull Phi-3-Mini-Instruct-CPU
+```
+
 | Key | Value |
 | --- | ----- |
 | Checkpoint | [amd/Phi-3-mini-4k-instruct_int4_float16_onnx_cpu](https://huggingface.co/amd/Phi-3-mini-4k-instruct_int4_float16_onnx_cpu) |
@@ -161,6 +209,10 @@ lemonade-server pull Qwen2.5-0.5B-Instruct-CPU
 <details>
 <summary>Qwen-1.5-7B-Chat-CPU</summary>
 
+```bash
+lemonade-server pull Qwen-1.5-7B-Chat-CPU
+```
+
 | Key | Value |
 | --- | ----- |
 | Checkpoint | [amd/Qwen1.5-7B-Chat_uint4_asym_g128_float16_onnx_cpu](https://huggingface.co/amd/Qwen1.5-7B-Chat_uint4_asym_g128_float16_onnx_cpu) |
@@ -172,6 +224,10 @@ lemonade-server pull Qwen2.5-0.5B-Instruct-CPU
 <details>
 <summary>DeepSeek-R1-Distill-Llama-8B-CPU</summary>
 
+```bash
+lemonade-server pull DeepSeek-R1-Distill-Llama-8B-CPU
+```
+
 | Key | Value |
 | --- | ----- |
 | Checkpoint | [amd/DeepSeek-R1-Distill-Llama-8B-awq-asym-uint4-g128-lmhead-onnx-cpu](https://huggingface.co/amd/DeepSeek-R1-Distill-Llama-8B-awq-asym-uint4-g128-lmhead-onnx-cpu) |
@@ -183,6 +239,10 @@ lemonade-server pull Qwen2.5-0.5B-Instruct-CPU
 <details>
 <summary>DeepSeek-R1-Distill-Qwen-7B-CPU</summary>
 
+```bash
+lemonade-server pull DeepSeek-R1-Distill-Qwen-7B-CPU
+```
+
 | Key | Value |
 | --- | ----- |
 | Checkpoint | [amd/DeepSeek-R1-Distill-Llama-8B-awq-asym-uint4-g128-lmhead-onnx-cpu](https://huggingface.co/amd/DeepSeek-R1-Distill-Llama-8B-awq-asym-uint4-g128-lmhead-onnx-cpu) |
````
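The tables above pair each Lemonade Server model name with its Hugging Face checkpoint, and each `<details>` block shows the matching `lemonade-server pull` command. As a quick illustration (not part of the Lemonade SDK), the sketch below copies a few of those pairs into a Python mapping and builds the documented pull command; the `MODELS` dict and `pull_command` helper are hypothetical names introduced for this example only.

```python
# Hypothetical mapping for this example: a few Lemonade Server model names
# (from the tables above) paired with their Hugging Face checkpoints.
MODELS = {
    "Llama-3.2-1B-Instruct-Hybrid":
        "amd/Llama-3.2-1B-Instruct-awq-g128-int4-asym-fp16-onnx-hybrid",
    "Qwen2.5-0.5B-Instruct-CPU":
        "amd/Qwen2.5-0.5B-Instruct-quantized_int4-float16-cpu-onnx",
    "Phi-3-Mini-Instruct-CPU":
        "amd/Phi-3-mini-4k-instruct_int4_float16_onnx_cpu",
}


def pull_command(name: str) -> str:
    """Return the CLI command documented above for downloading a model."""
    if name not in MODELS:
        raise KeyError(f"not a known Lemonade Server model: {name}")
    return f"lemonade-server pull {name}"
```

Running `pull_command("Llama-3.2-1B-Instruct-Hybrid")` yields the same command shown in that model's `<details>` block.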

docs/lemonade/server_spec.md

Lines changed: 82 additions & 0 deletions
````diff
@@ -9,6 +9,7 @@ We are also actively investigating and developing [additional endpoints](#additi
 ### OpenAI-Compatible Endpoints
 - POST `/api/v0/chat/completions` - Chat Completions (messages -> completion)
 - POST `/api/v0/completions` - Text Completions (prompt -> completion)
+- POST `/api/v0/responses` - Chat Completions (prompt|messages -> event)
 - GET `/api/v0/models` - List models available locally
 
 ### Additional Endpoints
@@ -65,6 +66,7 @@ Chat Completions API. You provide a list of messages and receive a completion. T
 | `stop` | No | Up to 4 sequences where the API will stop generating further tokens. The returned text will not contain the stop sequence. Can be a string or an array of strings. | <sub>![Status](https://img.shields.io/badge/available-green)</sub> |
 | `logprobs` | No | Include log probabilities of the output tokens. If true, returns the log probability of each output token. Defaults to false. | <sub>![Status](https://img.shields.io/badge/not_available-red)</sub> |
 | `temperature` | No | What sampling temperature to use. | <sub>![Status](https://img.shields.io/badge/available-green)</sub> |
+| `tools` | No | A list of tools the model may call. Only available when `stream` is set to `False`. | <sub>![Status](https://img.shields.io/badge/available-green)</sub> |
 | `max_tokens` | No | An upper bound for the number of tokens that can be generated for a completion. Mutually exclusive with `max_completion_tokens`. This value is now deprecated by OpenAI in favor of `max_completion_tokens`. | <sub>![Status](https://img.shields.io/badge/available-green)</sub> |
 | `max_completion_tokens` | No | An upper bound for the number of tokens that can be generated for a completion. Mutually exclusive with `max_tokens`. | <sub>![Status](https://img.shields.io/badge/available-green)</sub> |
 
@@ -207,6 +209,86 @@ The following format is used for both streaming and non-streaming responses:
 }
 ```
 
+
+### `POST /api/v0/responses` <sub>![Status](https://img.shields.io/badge/status-partially_available-green)</sub>
+
+Responses API. You provide an input and receive a response. This API will also load the model if it is not already loaded.
+
+#### Parameters
+
+| Parameter | Required | Description | Status |
+|-----------|----------|-------------|--------|
+| `input` | Yes | A list of dictionaries or a string input for the model to respond to. | <sub>![Status](https://img.shields.io/badge/available-green)</sub> |
+| `model` | Yes | The model to use for the response. | <sub>![Status](https://img.shields.io/badge/available-green)</sub> |
+| `max_output_tokens` | No | The maximum number of output tokens to generate. | <sub>![Status](https://img.shields.io/badge/available-green)</sub> |
+| `temperature` | No | What sampling temperature to use. | <sub>![Status](https://img.shields.io/badge/available-green)</sub> |
+| `stream` | No | If true, tokens will be sent as they are generated. If false, the response will be sent as a single message once complete. Defaults to false. | <sub>![Status](https://img.shields.io/badge/available-green)</sub> |
+
+> Note: The value for `model` is either a [Lemonade Server model name](https://github.com/onnx/turnkeyml/blob/main/docs/lemonade/server_models.md), or a checkpoint that has been pre-loaded using the [load endpoint](#get-apiv0load-status).
+
+#### Streaming Events
+
+The Responses API uses semantic events for streaming. Each event is typed with a predefined schema, so you can listen for the events you care about. Our initial implementation supports only:
+- `response.created`
+- `response.output_text.delta`
+- `response.completed`
+
+For a full list of event types, see the [API reference for streaming](https://platform.openai.com/docs/api-reference/responses-streaming).
+
+#### Example request
+
+PowerShell:
+
+```powershell
+Invoke-WebRequest -Uri "http://localhost:8000/api/v0/responses" `
+  -Method POST `
+  -Headers @{ "Content-Type" = "application/json" } `
+  -Body '{
+    "model": "Llama-3.2-1B-Instruct-Hybrid",
+    "input": "What is the population of Paris?",
+    "stream": false
+  }'
+```
+
+Bash:
+
+```bash
+curl -X POST http://localhost:8000/api/v0/responses \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "Llama-3.2-1B-Instruct-Hybrid",
+    "input": "What is the population of Paris?",
+    "stream": false
+  }'
+```
+
+#### Response format
+
+For non-streaming responses:
+
+```json
+{
+  "id": "0",
+  "created_at": 1746225832.0,
+  "model": "Llama-3.2-1B-Instruct-Hybrid",
+  "object": "response",
+  "output": [{
+    "id": "0",
+    "content": [{
+      "annotations": [],
+      "text": "Paris has a population of approximately 2.2 million people in the city proper."
+    }]
+  }]
+}
+```
+
+For streaming responses, the API returns a series of events. Refer to the [OpenAI streaming guide](https://platform.openai.com/docs/guides/streaming-responses?api-mode=responses) for details.
+
+
 ### `GET /api/v0/models` <sub>![Status](https://img.shields.io/badge/status-fully_available-green)</sub>
 
 Returns a list of key models available on the server in an OpenAI-compatible format. We also expanded each model object with the `checkpoint` and `recipe` fields, which may be used to load a model using the `load` endpoint.
````
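As a companion to the PowerShell and Bash examples in the spec above, here is a minimal Python sketch of a non-streaming call to the new `/api/v0/responses` endpoint. The payload fields and the `output[0].content[0].text` shape follow the documented request parameters and response format; the base URL, model name, and the `build_payload`/`extract_text` helper names are assumptions made for this example.

```python
import json
import urllib.request

# Assumed local server address, matching the curl example in the spec.
BASE_URL = "http://localhost:8000/api/v0"


def build_payload(model: str, user_input: str, stream: bool = False) -> dict:
    """Build a request body for POST /api/v0/responses (non-streaming by default)."""
    return {"model": model, "input": user_input, "stream": stream}


def extract_text(response: dict) -> str:
    """Pull the generated text out of a non-streaming response object,
    following the documented shape: output[0].content[0].text."""
    return response["output"][0]["content"][0]["text"]


if __name__ == "__main__":
    payload = build_payload("Llama-3.2-1B-Instruct-Hybrid",
                            "What is the population of Paris?")
    req = urllib.request.Request(
        f"{BASE_URL}/responses",
        data=json.dumps(payload).encode("utf-8"),
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    with urllib.request.urlopen(req) as resp:
        print(extract_text(json.load(resp)))
```

This sketch requires a running Lemonade Server with the named model available; the two helpers are pure functions, so the request-building and response-parsing logic can be reused with any HTTP client.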

examples/lemonade/demos/chat/chat_hybrid.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -1,7 +1,7 @@
 import sys
 from threading import Thread, Event
 from transformers import StoppingCriteriaList
-from lemonade.tools.serve import StopOnEvent
+from lemonade.tools.server.serve import StopOnEvent
 from lemonade.api import from_pretrained
 from lemonade.tools.ort_genai.oga import OrtGenaiStreamer
 
```
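The import change above (repeated across the demo files below) reflects the server module moving from `lemonade.tools.serve` to `lemonade.tools.server.serve`. For readers unfamiliar with `StopOnEvent`, the following is a rough, self-contained sketch of the pattern it implements, inferred from how the demos use it alongside `threading.Event` and `StoppingCriteriaList`; it is not the actual Lemonade implementation.

```python
from threading import Event


class StopOnEvent:
    """Illustrative stand-in for lemonade.tools.server.serve.StopOnEvent:
    a HuggingFace-style stopping criterion that halts generation once
    the supplied threading.Event is set (e.g. from another thread)."""

    def __init__(self, stop_event: Event):
        self.stop_event = stop_event

    def __call__(self, input_ids=None, scores=None, **kwargs) -> bool:
        # transformers calls a stopping criterion after each generated
        # token; returning True tells the generate loop to stop early.
        return self.stop_event.is_set()


# Typical wiring in the demos: wrap the criterion in a
# StoppingCriteriaList and set the event to interrupt generation.
stop = Event()
criterion = StopOnEvent(stop)
```

In the demos, another thread sets the event (for example when the user interrupts the chat), and the generation loop observes the criterion returning `True` on its next token.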

examples/lemonade/demos/chat/chat_start.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -3,7 +3,7 @@
 from queue import Queue
 from time import sleep
 from transformers import StoppingCriteriaList
-from lemonade.tools.serve import StopOnEvent
+from lemonade.tools.server.serve import StopOnEvent
 
 
 class TextStreamer:
```

examples/lemonade/demos/search/search_hybrid.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -3,7 +3,7 @@
 from transformers import StoppingCriteriaList
 from lemonade.api import from_pretrained
 from lemonade.tools.ort_genai.oga import OrtGenaiStreamer
-from lemonade.tools.serve import StopOnEvent
+from lemonade.tools.server.serve import StopOnEvent
 
 employee_handbook = """
 1. You will work very hard every day.\n
```

examples/lemonade/demos/search/search_start.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -3,7 +3,7 @@
 from queue import Queue
 from time import sleep
 from transformers import StoppingCriteriaList
-from lemonade.tools.serve import StopOnEvent
+from lemonade.tools.server.serve import StopOnEvent
 
 
 employee_handbook = """
```

examples/lemonade/server/README.md

Lines changed: 11 additions & 8 deletions
```diff
@@ -8,14 +8,17 @@ This allows the same application to leverage local LLMs instead of relying on Op
 
 | App | Guide | Video |
 |---------------------|-----------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------|
-| [Open WebUI](https://github.com/open-webui/open-webui) | [How to chat with Lemonade LLMs in Open WebUI](https://ryzenai.docs.amd.com/en/latest/llm/server_interface.html#open-webui-demo) | [Watch Demo](https://www.youtube.com/watch?v=PXNTDZREJ_A) |
-| [Continue](https://www.continue.dev/) | [How to use Lemonade LLMs as a coding assistant in Continue](continue.md) | _coming soon_ |
-| [Microsoft AI Toolkit](https://learn.microsoft.com/en-us/windows/ai/toolkit/) | [Experimenting with Lemonade LLMs in VS Code using Microsoft's AI Toolkit](ai-toolkit.md) | _coming soon_ |
-| [CodeGPT](https://codegpt.co/) | [How to use Lemonade LLMs as a coding assistant in CodeGPT](codeGPT.md) | _coming soon_ |
-[MindCraft](mindcraft.md) | [How to use Lemonade LLMs as a Minecraft agent](mindcraft.md) | _coming soon_ |
-| [wut](https://github.com/shobrook/wut) | [Terminal assistant that uses Lemonade LLMs to explain errors](wut.md) | _coming soon_ |
-| [AnythingLLM](https://anythingllm.com/) | [Running agents locally with Lemonade and AnythingLLM](anythingLLM.md) | _coming soon_ |
-| [lm-eval](https://github.com/EleutherAI/lm-evaluation-harness) | [A unified framework to test generative language models on a large number of different evaluation tasks.](lm-eval.md) | _coming soon_
+| [Open WebUI](https://github.com/open-webui/open-webui) | [How to chat with Lemonade LLMs in Open WebUI](https://ryzenai.docs.amd.com/en/latest/llm/server_interface.html#open-webui-demo) | [Watch Demo](https://www.youtube.com/watch?v=PXNTDZREJ_A) |
+| [Continue](https://www.continue.dev/) | [How to use Lemonade LLMs as a coding assistant in Continue](continue.md) | [Watch Demo](https://youtu.be/bP_MZnDpbUc?si=hRhLbLEV6V_OGlUt) |
+| [Microsoft AI Toolkit](https://learn.microsoft.com/en-us/windows/ai/toolkit/) | [Experimenting with Lemonade LLMs in VS Code using Microsoft's AI Toolkit](ai-toolkit.md) | [Watch Demo](https://youtu.be/JecpotOZ6qo?si=WxWVQhUBCJQgE6vX) |
+| [GAIA](https://github.com/amd/gaia) | [An application for running LLMs locally, includes a ChatBot, YouTube Agent, and more](https://github.com/amd/gaia?tab=readme-ov-file#getting-started-guide) | [Watch Demo](https://youtu.be/_PORHv_-atI?si=EYQjmrRQ6Zy2H0ek) |
+| [CodeGPT](https://codegpt.co/) | [How to use Lemonade LLMs as a coding assistant in CodeGPT](codeGPT.md) | _coming soon_ |
+| [MindCraft](mindcraft.md) | [How to use Lemonade LLMs as a Minecraft agent](mindcraft.md) | _coming soon_ |
+| [wut](https://github.com/shobrook/wut) | [Terminal assistant that uses Lemonade LLMs to explain errors](wut.md) | _coming soon_ |
+| [AnythingLLM](https://anythingllm.com/) | [Running agents locally with Lemonade and AnythingLLM](anythingLLM.md) | _coming soon_ |
+| [lm-eval](https://github.com/EleutherAI/lm-evaluation-harness) | [A unified framework to test generative language models on a large number of different evaluation tasks.](lm-eval.md) | _coming soon_ |
+| [PEEL](https://github.com/lemonade-apps/peel) | [Using Local LLMs in Windows PowerShell](https://github.com/lemonade-apps/peel?tab=readme-ov-file#installation) | _coming soon_ |
+
 ## 📦 Looking for Installation Help?
 
 To set up Lemonade Server, check out the [Lemonade_Server_Installer.exe guide](lemonade_server_exe.md) for installation instructions and the [server spec](https://github.com/onnx/turnkeyml/blob/main/docs/lemonade/server_spec.md) to learn more about the functionality. For more information about 🍋 Lemonade SDK, see the [Lemonade SDK README](https://github.com/onnx/turnkeyml/tree/main/docs/lemonade/).
```

examples/lemonade/server/mindcraft.md

Lines changed: 1 addition & 1 deletion
```diff
@@ -342,4 +342,4 @@ The following are examples of requests made by the Mindcraft software to the Lem
 TRACE: ::1:56890 - ASGI [6] Send {'type': 'http.response.body', 'body': '<0 bytes>', 'more_body': False}
 TRACE: ::1:56890 - ASGI [6] Completed
 TRACE: ::1:56890 - HTTP connection lost
-```
+```
```

The removed and added lines have identical content; this hunk is a whitespace-only change that adds the previously missing newline at the end of the file.
