
Commit 5d016f9

Merge branch 'main' into dependabot/pip/demos/a2a_llama_stack/pygments-2.20.0
2 parents 176a0a1 + b4116df commit 5d016f9

14 files changed

Lines changed: 2228 additions & 898 deletions
Lines changed: 93 additions & 0 deletions
@@ -0,0 +1,93 @@
"""
Demo: Responses API - max_output_tokens

Description:
This demo shows how to use the max_output_tokens parameter with the
OpenAI Responses API against a Llama Stack server. The parameter limits
the number of tokens the model can generate in its response.

Learning Objectives:
- Control response length with max_output_tokens
- Observe how the model truncates output when the limit is reached
- Check the response status for length-related truncation
"""

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from __future__ import annotations

import os
import sys

import fire
from openai import OpenAI
from termcolor import colored

sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
from shared.utils import resolve_openai_model

try:
    from dotenv import load_dotenv
except ImportError:  # pragma: no cover - optional dependency
    load_dotenv = None


def _maybe_load_dotenv() -> None:
    if load_dotenv is not None:
        load_dotenv()


def main(
    host: str,
    port: int,
    model_id: str | None = None,
    scheme: str = "http",
) -> None:
    _maybe_load_dotenv()

    if scheme not in {"http", "https"}:
        raise ValueError("scheme must be 'http' or 'https'")
    if host not in {"localhost", "127.0.0.1", "::1"} and scheme != "https":
        print(colored("Warning: using HTTP for a non-local host. Consider --scheme https.", "yellow"))

    client = OpenAI(
        base_url=f"{scheme}://{host}:{port}/v1",
        api_key=os.getenv("LLAMA_STACK_API_KEY", "fake"),
    )

    resolved_model = resolve_openai_model(client, model_id)
    if resolved_model is None:
        return
    print(f"Using model: {resolved_model}")

    # --- Example 1: Small token limit to force truncation ---
    print(colored("\n--- max_output_tokens=50 (short response) ---", "cyan"))
    response = client.responses.create(
        model=resolved_model,
        input="Write a detailed essay about the history of artificial intelligence.",
        max_output_tokens=50,
    )
    print(f"Output: {response.output_text}")
    print(f"Status: {response.status}")
    if hasattr(response, "incomplete_details") and response.incomplete_details:
        print(f"Incomplete details: {response.incomplete_details}")

    # --- Example 2: Larger token limit for a fuller response ---
    print(colored("\n--- max_output_tokens=200 (longer response) ---", "cyan"))
    response = client.responses.create(
        model=resolved_model,
        input="Write a detailed essay about the history of artificial intelligence.",
        max_output_tokens=200,
    )
    print(f"Output: {response.output_text}")
    print(f"Status: {response.status}")
    if hasattr(response, "incomplete_details") and response.incomplete_details:
        print(f"Incomplete details: {response.incomplete_details}")


if __name__ == "__main__":
    fire.Fire(main)
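
Note: resolve_openai_model is imported from shared/utils.py, which is not part of this commit. Below is a minimal sketch of what it presumably does, assuming it validates a caller-supplied model_id against the server's model list and otherwise falls back to the first available model; the real helper may behave differently.

# Hypothetical sketch of shared/utils.resolve_openai_model (not in this diff).
# Assumption: it returns a usable model id, or None after printing a hint.
from __future__ import annotations

from openai import OpenAI


def resolve_openai_model(client: OpenAI, model_id: str | None) -> str | None:
    available = [m.id for m in client.models.list()]
    if model_id is not None:
        if model_id in available:
            return model_id
        print(f"Model {model_id!r} not found. Available models: {available}")
        return None
    if not available:
        print("No models are registered on the server.")
        return None
    return available[0]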
Lines changed: 102 additions & 0 deletions
@@ -0,0 +1,102 @@
"""
Demo: Responses API - top_p (Nucleus Sampling)

Description:
This demo shows how to use the top_p parameter with the OpenAI Responses API
against a Llama Stack server. The top_p parameter controls nucleus sampling,
where the model considers only the smallest set of tokens whose cumulative
probability mass reaches the specified threshold.

Learning Objectives:
- Use top_p to control sampling diversity
- Compare outputs with different top_p values
- Understand the relationship between top_p and response randomness
"""

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from __future__ import annotations

import os
import sys

import fire
from openai import OpenAI
from termcolor import colored

sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
from shared.utils import resolve_openai_model

try:
    from dotenv import load_dotenv
except ImportError:  # pragma: no cover - optional dependency
    load_dotenv = None


def _maybe_load_dotenv() -> None:
    if load_dotenv is not None:
        load_dotenv()


def main(
    host: str,
    port: int,
    model_id: str | None = None,
    scheme: str = "http",
) -> None:
    _maybe_load_dotenv()

    if scheme not in {"http", "https"}:
        raise ValueError("scheme must be 'http' or 'https'")
    if host not in {"localhost", "127.0.0.1", "::1"} and scheme != "https":
        print(colored("Warning: using HTTP for a non-local host. Consider --scheme https.", "yellow"))

    client = OpenAI(
        base_url=f"{scheme}://{host}:{port}/v1",
        api_key=os.getenv("LLAMA_STACK_API_KEY", "fake"),
    )

    resolved_model = resolve_openai_model(client, model_id)
    if resolved_model is None:
        return
    print(f"Using model: {resolved_model}")

    prompt = "Suggest a creative name for a new AI startup."

    # --- Example 1: Low top_p (more focused/deterministic) ---
    print(colored("\n--- top_p=0.1 (focused sampling) ---", "cyan"))
    response = client.responses.create(
        model=resolved_model,
        input=prompt,
        top_p=0.1,
        temperature=1.0,
    )
    print(f"Output: {response.output_text}")

    # --- Example 2: Medium top_p ---
    print(colored("\n--- top_p=0.5 (moderate sampling) ---", "cyan"))
    response = client.responses.create(
        model=resolved_model,
        input=prompt,
        top_p=0.5,
        temperature=1.0,
    )
    print(f"Output: {response.output_text}")

    # --- Example 3: High top_p (more diverse/creative) ---
    print(colored("\n--- top_p=0.95 (diverse sampling) ---", "cyan"))
    response = client.responses.create(
        model=resolved_model,
        input=prompt,
        top_p=0.95,
        temperature=1.0,
    )
    print(f"Output: {response.output_text}")


if __name__ == "__main__":
    fire.Fire(main)
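
A possible follow-up, not part of the commit: to see the effect of top_p more concretely, repeat the same request several times at each value and count distinct completions. The base URL, port 8321, and the model-id placeholder below are assumptions.

# Hypothetical add-on (not in this diff): quantify sampling diversity by
# counting distinct completions per top_p value.
from collections import Counter

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1", api_key="fake")  # assumed local Llama Stack server
prompt = "Suggest a creative name for a new AI startup."

for top_p in (0.1, 0.95):
    outputs = [
        client.responses.create(
            model="<your-model-id>",  # placeholder
            input=prompt,
            top_p=top_p,
            temperature=1.0,
        ).output_text
        for _ in range(5)
    ]
    print(f"top_p={top_p}: {len(Counter(outputs))} distinct outputs in {len(outputs)} runs")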
Lines changed: 102 additions & 0 deletions
@@ -0,0 +1,102 @@
"""
Demo: Responses API - truncation

Description:
This demo shows how to use the truncation parameter with the OpenAI Responses
API against a Llama Stack server. The truncation parameter controls how the
model handles context that exceeds the context window, allowing automatic
truncation of older messages.

Learning Objectives:
- Use the truncation parameter to handle long contexts
- Understand the "auto" vs "disabled" truncation strategies
- See how truncation affects multi-turn conversations
"""

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from __future__ import annotations

import os
import sys

import fire
from openai import OpenAI
from termcolor import colored

sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
from shared.utils import resolve_openai_model

try:
    from dotenv import load_dotenv
except ImportError:  # pragma: no cover - optional dependency
    load_dotenv = None


def _maybe_load_dotenv() -> None:
    if load_dotenv is not None:
        load_dotenv()


def main(
    host: str,
    port: int,
    model_id: str | None = None,
    scheme: str = "http",
) -> None:
    _maybe_load_dotenv()

    if scheme not in {"http", "https"}:
        raise ValueError("scheme must be 'http' or 'https'")
    if host not in {"localhost", "127.0.0.1", "::1"} and scheme != "https":
        print(colored("Warning: using HTTP for a non-local host. Consider --scheme https.", "yellow"))

    client = OpenAI(
        base_url=f"{scheme}://{host}:{port}/v1",
        api_key=os.getenv("LLAMA_STACK_API_KEY", "fake"),
    )

    resolved_model = resolve_openai_model(client, model_id)
    if resolved_model is None:
        return
    print(f"Using model: {resolved_model}")

    # --- Example 1: Auto truncation (truncate older messages if context is too long) ---
    # Note: truncation='auto' is not yet supported by Llama Stack.
    print(colored("\n--- truncation='auto' ---", "cyan"))
    try:
        response = client.responses.create(
            model=resolved_model,
            input=[
                {"role": "user", "content": "My name is Alice."},
                {"role": "assistant", "content": "Hello Alice! How can I help you today?"},
                {"role": "user", "content": "What is 2 + 2?"},
                {"role": "assistant", "content": "2 + 2 equals 4."},
                {"role": "user", "content": "What is my name?"},
            ],
            truncation="auto",
        )
        print(f"Output: {response.output_text}")
    except Exception as e:
        print(colored(f"Not supported: {e}", "yellow"))

    # --- Example 2: Disabled truncation (fail if context exceeds window) ---
    print(colored("\n--- truncation='disabled' ---", "cyan"))
    response = client.responses.create(
        model=resolved_model,
        input=[
            {"role": "user", "content": "My name is Bob."},
            {"role": "assistant", "content": "Nice to meet you, Bob!"},
            {"role": "user", "content": "What is my name?"},
        ],
        truncation="disabled",
    )
    print(f"Output: {response.output_text}")


if __name__ == "__main__":
    fire.Fire(main)
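
A hedged extension, not part of the commit: with truncation='disabled', a request whose input exceeds the model's context window is expected to fail rather than be trimmed. The sketch below assumes a local server on port 8321 and a placeholder model id, and it catches the OpenAI SDK's base OpenAIError since the exact error surfaced by the server may vary.

# Hypothetical sketch (not in this diff): observing the failure mode when the
# context exceeds the window and truncation is disabled.
import openai
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1", api_key="fake")  # assumed local Llama Stack server

very_long_input = "word " * 200_000  # illustrative oversized prompt
try:
    response = client.responses.create(
        model="<your-model-id>",  # placeholder
        input=very_long_input,
        truncation="disabled",
    )
    print(response.output_text)
except openai.OpenAIError as e:
    print(f"Request rejected (likely context too long): {e}")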
