
Commit 5d016f9

Merge branch 'main' into dependabot/pip/demos/a2a_llama_stack/pygments-2.20.0
2 parents 176a0a1 + b4116df commit 5d016f9

14 files changed

Lines changed: 2228 additions & 898 deletions
Lines changed: 93 additions & 0 deletions
@@ -0,0 +1,93 @@
"""
Demo: Responses API - max_output_tokens

Description:
This demo shows how to use the max_output_tokens parameter with the
OpenAI Responses API against a Llama Stack server. The parameter limits
the number of tokens the model can generate in its response.

Learning Objectives:
- Control response length with max_output_tokens
- Observe how the model truncates output when the limit is reached
- Check the response status for length-related truncation
"""

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from __future__ import annotations

import os
import sys

import fire
from openai import OpenAI
from termcolor import colored

sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
from shared.utils import resolve_openai_model

try:
    from dotenv import load_dotenv
except ImportError:  # pragma: no cover - optional dependency
    load_dotenv = None


def _maybe_load_dotenv() -> None:
    if load_dotenv is not None:
        load_dotenv()


def main(
    host: str,
    port: int,
    model_id: str | None = None,
    scheme: str = "http",
) -> None:
    _maybe_load_dotenv()

    if scheme not in {"http", "https"}:
        raise ValueError("scheme must be 'http' or 'https'")
    if host not in {"localhost", "127.0.0.1", "::1"} and scheme != "https":
        print(colored("Warning: using HTTP for a non-local host. Consider --scheme https.", "yellow"))

    client = OpenAI(
        base_url=f"{scheme}://{host}:{port}/v1",
        api_key=os.getenv("LLAMA_STACK_API_KEY", "fake"),
    )

    resolved_model = resolve_openai_model(client, model_id)
    if resolved_model is None:
        return
    print(f"Using model: {resolved_model}")

    # --- Example 1: Small token limit to force truncation ---
    print(colored("\n--- max_output_tokens=50 (short response) ---", "cyan"))
    response = client.responses.create(
        model=resolved_model,
        input="Write a detailed essay about the history of artificial intelligence.",
        max_output_tokens=50,
    )
    print(f"Output: {response.output_text}")
    print(f"Status: {response.status}")
    if hasattr(response, "incomplete_details") and response.incomplete_details:
        print(f"Incomplete details: {response.incomplete_details}")

    # --- Example 2: Larger token limit for a fuller response ---
    print(colored("\n--- max_output_tokens=200 (longer response) ---", "cyan"))
    response = client.responses.create(
        model=resolved_model,
        input="Write a detailed essay about the history of artificial intelligence.",
        max_output_tokens=200,
    )
    print(f"Output: {response.output_text}")
    print(f"Status: {response.status}")
    if hasattr(response, "incomplete_details") and response.incomplete_details:
        print(f"Incomplete details: {response.incomplete_details}")


if __name__ == "__main__":
    fire.Fire(main)
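
Note: resolve_openai_model is imported from shared/utils.py, which is not part of this commit. Below is a minimal sketch of what it presumably does, assuming it validates a caller-supplied model_id against the server's model list and otherwise falls back to the first available model; the real helper may behave differently.

# Hypothetical sketch of shared/utils.resolve_openai_model (not in this diff).
# Assumption: it returns a usable model id, or None after printing a hint.
from __future__ import annotations

from openai import OpenAI


def resolve_openai_model(client: OpenAI, model_id: str | None) -> str | None:
    available = [m.id for m in client.models.list()]
    if model_id is not None:
        if model_id in available:
            return model_id
        print(f"Model {model_id!r} not found. Available models: {available}")
        return None
    if not available:
        print("No models are registered on the server.")
        return None
    return available[0]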
Lines changed: 102 additions & 0 deletions
@@ -0,0 +1,102 @@
"""
Demo: Responses API - top_p (Nucleus Sampling)

Description:
This demo shows how to use the top_p parameter with the OpenAI Responses API
against a Llama Stack server. The top_p parameter controls nucleus sampling,
where the model considers only the smallest set of tokens whose cumulative
probability mass reaches the specified threshold.

Learning Objectives:
- Use top_p to control sampling diversity
- Compare outputs with different top_p values
- Understand the relationship between top_p and response randomness
"""

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from __future__ import annotations

import os
import sys

import fire
from openai import OpenAI
from termcolor import colored

sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
from shared.utils import resolve_openai_model

try:
    from dotenv import load_dotenv
except ImportError:  # pragma: no cover - optional dependency
    load_dotenv = None


def _maybe_load_dotenv() -> None:
    if load_dotenv is not None:
        load_dotenv()


def main(
    host: str,
    port: int,
    model_id: str | None = None,
    scheme: str = "http",
) -> None:
    _maybe_load_dotenv()

    if scheme not in {"http", "https"}:
        raise ValueError("scheme must be 'http' or 'https'")
    if host not in {"localhost", "127.0.0.1", "::1"} and scheme != "https":
        print(colored("Warning: using HTTP for a non-local host. Consider --scheme https.", "yellow"))

    client = OpenAI(
        base_url=f"{scheme}://{host}:{port}/v1",
        api_key=os.getenv("LLAMA_STACK_API_KEY", "fake"),
    )

    resolved_model = resolve_openai_model(client, model_id)
    if resolved_model is None:
        return
    print(f"Using model: {resolved_model}")

    prompt = "Suggest a creative name for a new AI startup."

    # --- Example 1: Low top_p (more focused/deterministic) ---
    print(colored("\n--- top_p=0.1 (focused sampling) ---", "cyan"))
    response = client.responses.create(
        model=resolved_model,
        input=prompt,
        top_p=0.1,
        temperature=1.0,
    )
    print(f"Output: {response.output_text}")

    # --- Example 2: Medium top_p ---
    print(colored("\n--- top_p=0.5 (moderate sampling) ---", "cyan"))
    response = client.responses.create(
        model=resolved_model,
        input=prompt,
        top_p=0.5,
        temperature=1.0,
    )
    print(f"Output: {response.output_text}")

    # --- Example 3: High top_p (more diverse/creative) ---
    print(colored("\n--- top_p=0.95 (diverse sampling) ---", "cyan"))
    response = client.responses.create(
        model=resolved_model,
        input=prompt,
        top_p=0.95,
        temperature=1.0,
    )
    print(f"Output: {response.output_text}")


if __name__ == "__main__":
    fire.Fire(main)
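
A possible follow-up, not part of the commit: to see the effect of top_p more concretely, repeat the same request several times at each value and count distinct completions. The base URL, port 8321, and the model-id placeholder below are assumptions.

# Hypothetical add-on (not in this diff): quantify sampling diversity by
# counting distinct completions per top_p value.
from collections import Counter

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1", api_key="fake")  # assumed local Llama Stack server
prompt = "Suggest a creative name for a new AI startup."

for top_p in (0.1, 0.95):
    outputs = [
        client.responses.create(
            model="<your-model-id>",  # placeholder
            input=prompt,
            top_p=top_p,
            temperature=1.0,
        ).output_text
        for _ in range(5)
    ]
    print(f"top_p={top_p}: {len(Counter(outputs))} distinct outputs in {len(outputs)} runs")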
Lines changed: 102 additions & 0 deletions
@@ -0,0 +1,102 @@
"""
Demo: Responses API - truncation

Description:
This demo shows how to use the truncation parameter with the OpenAI Responses
API against a Llama Stack server. The truncation parameter controls how the
model handles context that exceeds the context window, allowing automatic
truncation of older messages.

Learning Objectives:
- Use the truncation parameter to handle long contexts
- Understand the "auto" vs "disabled" truncation strategies
- See how truncation affects multi-turn conversations
"""

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

from __future__ import annotations

import os
import sys

import fire
from openai import OpenAI
from termcolor import colored

sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
from shared.utils import resolve_openai_model

try:
    from dotenv import load_dotenv
except ImportError:  # pragma: no cover - optional dependency
    load_dotenv = None


def _maybe_load_dotenv() -> None:
    if load_dotenv is not None:
        load_dotenv()


def main(
    host: str,
    port: int,
    model_id: str | None = None,
    scheme: str = "http",
) -> None:
    _maybe_load_dotenv()

    if scheme not in {"http", "https"}:
        raise ValueError("scheme must be 'http' or 'https'")
    if host not in {"localhost", "127.0.0.1", "::1"} and scheme != "https":
        print(colored("Warning: using HTTP for a non-local host. Consider --scheme https.", "yellow"))

    client = OpenAI(
        base_url=f"{scheme}://{host}:{port}/v1",
        api_key=os.getenv("LLAMA_STACK_API_KEY", "fake"),
    )

    resolved_model = resolve_openai_model(client, model_id)
    if resolved_model is None:
        return
    print(f"Using model: {resolved_model}")

    # --- Example 1: Auto truncation (truncate older messages if context is too long) ---
    # Note: truncation='auto' is not yet supported by Llama Stack.
    print(colored("\n--- truncation='auto' ---", "cyan"))
    try:
        response = client.responses.create(
            model=resolved_model,
            input=[
                {"role": "user", "content": "My name is Alice."},
                {"role": "assistant", "content": "Hello Alice! How can I help you today?"},
                {"role": "user", "content": "What is 2 + 2?"},
                {"role": "assistant", "content": "2 + 2 equals 4."},
                {"role": "user", "content": "What is my name?"},
            ],
            truncation="auto",
        )
        print(f"Output: {response.output_text}")
    except Exception as e:
        print(colored(f"Not supported: {e}", "yellow"))

    # --- Example 2: Disabled truncation (fail if context exceeds window) ---
    print(colored("\n--- truncation='disabled' ---", "cyan"))
    response = client.responses.create(
        model=resolved_model,
        input=[
            {"role": "user", "content": "My name is Bob."},
            {"role": "assistant", "content": "Nice to meet you, Bob!"},
            {"role": "user", "content": "What is my name?"},
        ],
        truncation="disabled",
    )
    print(f"Output: {response.output_text}")


if __name__ == "__main__":
    fire.Fire(main)
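
A hedged extension, not part of the commit: with truncation='disabled', a request whose input exceeds the model's context window is expected to fail rather than be trimmed. The sketch below assumes a local server on port 8321 and a placeholder model id, and it catches the OpenAI SDK's base OpenAIError since the exact error surfaced by the server may vary.

# Hypothetical sketch (not in this diff): observing the failure mode when the
# context exceeds the window and truncation is disabled.
import openai
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8321/v1", api_key="fake")  # assumed local Llama Stack server

very_long_input = "word " * 200_000  # illustrative oversized prompt
try:
    response = client.responses.create(
        model="<your-model-id>",  # placeholder
        input=very_long_input,
        truncation="disabled",
    )
    print(response.output_text)
except openai.OpenAIError as e:
    print(f"Request rejected (likely context too long): {e}")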
