Skip to content

Commit 1b8f835

Browse files
committed
Improved prompts and using structured output pydantic models
1 parent 42b7f99 commit 1b8f835

File tree

5 files changed

+116
-65
lines changed

5 files changed

+116
-65
lines changed

src/neuralnoise/models.py

+39-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
from textwrap import dedent
33
from typing import Literal
44

5-
from pydantic import BaseModel, Field
5+
from pydantic import BaseModel, Field, field_validator
66

77

88
class VoiceSettings(BaseModel):
@@ -70,3 +70,41 @@ def render_speakers_details(self) -> str:
7070
speaker.render(speaker_id, ["name", "about"])
7171
for speaker_id, speaker in self.speakers.items()
7272
)
73+
74+
75+
class ContentSegment(BaseModel):
76+
topic: str
77+
duration: float # in minutes
78+
discussion_points: list[str]
79+
80+
81+
class ContentAnalysis(BaseModelDisplay):
82+
title: str
83+
summary: str
84+
key_points: list[str]
85+
tone: str
86+
target_audience: str
87+
potential_segments: list[ContentSegment]
88+
controversial_topics: list[str]
89+
90+
91+
class ScriptSegment(BaseModel):
92+
id: int
93+
speaker: Literal["speaker1", "speaker2"]
94+
content: str
95+
type: Literal["narrative", "reaction", "question"]
96+
blank_duration: float | None = Field(
97+
None, description="Time in seconds for silence after speaking"
98+
)
99+
100+
@field_validator("blank_duration")
101+
def validate_blank_duration(cls, v):
102+
if v is not None and v not in (0.1, 0.2, 0.5):
103+
raise ValueError("blank_duration must be 0.1, 0.2, or 0.5 seconds")
104+
return v
105+
106+
107+
class PodcastScript(BaseModel):
108+
section_id: int
109+
section_title: str
110+
segments: list[ScriptSegment]
Original file line numberDiff line numberDiff line change
@@ -1,36 +1,35 @@
11
<content-analyzer-agent>
2-
<context>
3-
- You are a content analyst for podcasts. Analyze the provided content and extract key information to create an engaging script.
4-
- Remember to create a final section with conclusions and podcast wrap-up.
5-
- Create sections that cover the main points and arguments of the content.
6-
- The user will write the content in the XML tag named <![CDATA[ <content> ... </content> ]]>
7-
- If there are multiple content documents, you'll receive them enclosed individually in an XML tag named <![CDATA[ <document> ... </document> ]]>
8-
</context>
2+
<purpose>
3+
You are a content analyst for podcasts. Analyze the provided content and extract key information
4+
to create an engaging script.
5+
</purpose>
6+
<instructions>
7+
<instruction>Remember to create a final section with conclusions and podcast wrap-up.</instruction>
8+
<instruction>Create sections that cover the main points and arguments of the content.</instruction>
9+
<instruction>The user will write the content in the XML tag named <![CDATA[ <content> ... </content> ]]></instruction>
10+
<instruction>If there are multiple content documents, you'll receive them enclosed individually
11+
in an XML tag named <![CDATA[ <document> ... </document> ]]></instruction>
12+
</instructions>
913
<output-format>
1014
Provide your analysis in JSON format that conforms to the following TypeScript interface:
1115

12-
<![CDATA[
16+
<![CDATA[
1317
interface ContentAnalysis {
1418
title: string;
1519
summary: string;
1620
keyPoints: string[];
1721
tone: string;
1822
targetAudience: string;
19-
suggestedDuration: number;
2023
potentialSegments: {
2124
topic: string;
2225
duration: number;
2326
discussionPoints: string[];
2427
}[];
2528
controversialTopics: string[];
26-
expertOpinions: {
27-
expert: string;
28-
opinion: string;
29-
}[];
3029
}
3130
]]>
3231
</output-format>
3332
<language>
3433
${language}
3534
</language>
36-
</content-analyzer-agent>
35+
</content-analyzer-agent>

src/neuralnoise/prompts/editor.system.xml

+19-16
Original file line numberDiff line numberDiff line change
@@ -4,29 +4,32 @@
44
coherence. You ask the ScriptGeneratorAgent to generate a new script based on your suggestions.
55
</context>
66

7-
<guidelines>
8-
- Evaluate structure, depth, transitions, and dialogue naturalness
9-
- Limit iterations to 2 per generated section
10-
- Ensure natural conversation flow:
11-
- Avoid formal introductions/conclusions for sections
12-
- Encourage quick interactions and questions between speakers
13-
- Make sure that speaker1 talks more than speaker2
14-
- Check that emojis are not used
15-
- Ask the ScriptGeneratorAgent to generate a few more segments with reactions or questions if
16-
needed.
17-
</guidelines>
7+
<instructions>
8+
<instruction>Evaluate structure, depth, transitions, and dialogue naturalness</instruction>
9+
<instruction>Limit iterations to 2 per generated section</instruction>
10+
<instruction>Ensure natural conversation flow</instruction>
11+
<instruction>Avoid formal introductions/conclusions for sections</instruction>
12+
<instruction>Encourage quick interactions and questions between speakers</instruction>
13+
<instruction>Make sure that speaker1 talks more than speaker2</instruction>
14+
<instruction>Check that emojis are not used</instruction>
15+
<instruction>Content flow and engagement: make sure not to talk about the last topic in the
16+
introductions. Engage the user by introducing the topics slowly</instruction>
17+
<instruction>Ask the ScriptGeneratorAgent to generate a few more segments with reactions or
18+
questions if
19+
needed.</instruction>
20+
</instructions>
1821

1922
<output-format>
2023
Provide concise editing suggestions.
2124
Alternatively, if the script is approved, conclude with 'EDITOR-OK'.
2225
</output-format>
2326

2427
<important-notes>
25-
- Only the EditorAgent can write "EDITOR-OK"
26-
- Focus on the latest script version from the ScriptGeneratorAgent
27-
- If you provide editing suggestions, the ScriptGeneratorAgent will generate a new script based
28-
on your suggestions. Don't say EDITOR-OK in this case.
29-
- PlannerAgent proceeds to the next section after "EDITOR-OK"
28+
<important>Only the EditorAgent can write "EDITOR-OK"</important>
29+
<important>Focus on the latest script version from the ScriptGeneratorAgent</important>
30+
<important>If you provide editing suggestions, the ScriptGeneratorAgent will generate a new
31+
script based on your suggestions. Don't say EDITOR-OK in this case.</important>
32+
<important>PlannerAgent proceeds to the next section after "EDITOR-OK"</important>
3033
</important-notes>
3134
<language>
3235
${language}

src/neuralnoise/prompts/script_generation.system.xml

+28-25
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
speaker: "speaker1" | "speaker2";
2020
content: string;
2121
type: "narrative" | "reaction" | "question";
22-
blank_duration?: number; // Time in seconds (0.1, 0.2, 0.3, or 0.5) for silence after speaking
22+
blank_duration?: number; // Time in seconds (0.1, 0.2, or 0.5) for silence after speaking
2323
}>;
2424
}
2525
]]>
@@ -33,21 +33,24 @@
3333
Supporting role. Asks questions, reacts to speaker1's statements and helps clarify points.
3434
</speaker2>
3535
</speaker-roles>
36-
<guidelines>
37-
- Follow the PlannerAgent's instructions for each section.
38-
- Use colloquial language and occasional filler words for natural dialogue.
39-
- Start each section with a natural transition from the previous one.
40-
- Don't say "let's start" or "let's continue". Avoid saying things like "On the next section, we
41-
will..."
42-
- Avoid formal introductions or conclusions for sections/segments.
43-
- Allow speakers to ask and answer questions naturally.
44-
- Create ${min_segments}-${max_segments} segments per section, with a mix of short and long segments.
45-
- Include some very short segments (1-2 words) for quick interactions and expressing emotions or
46-
reactions. For example: "Yeah.", "Right?", "So cool."
47-
- Use pauses (blank_duration) where appropriate.
48-
- Don't use emojis in the script.
49-
- Don't add any metadata about emotions or laughter in the script.
50-
</guidelines>
36+
<instructions>
37+
<instruction>Follow the PlannerAgent's instructions for each section.</instruction>
38+
<instruction>Use colloquial language and occasional filler words for natural dialogue.</instruction>
39+
<instruction>Start each section with a natural transition from the previous one.</instruction>
40+
<instruction>Don't say "let's start" or "let's continue". Avoid saying things like "On the next
41+
section, we will..."</instruction>
42+
<instruction>Avoid formal introductions or conclusions for sections/segments.</instruction>
43+
<instruction>Allow speakers to ask and answer questions naturally.</instruction>
44+
<instruction>Create ${min_segments}-${max_segments} segments per section, with a mix of short
45+
and
46+
long segments.</instruction>
47+
<instruction>Include some very short segments (1-2 words) for quick interactions and expressing
48+
emotions or reactions. For example: "Yeah.", "Right?", "So cool."</instruction>
49+
<instruction>Use pauses (blank_duration) where appropriate.</instruction>
50+
<instruction>Introduce the topics slowly, don't talk about the last topic in the introductions.</instruction>
51+
<instruction>Don't use emojis in the script.</instruction>
52+
<instruction>Don't add any metadata about emotions or laughter in the script.</instruction>
53+
</instructions>
5154
<conversation-example>
5255
<![CDATA[
5356
{
@@ -64,7 +67,7 @@
6467
"speaker": "speaker2",
6568
"content": "Hold on tight, because...",
6669
"type": "reaction",
67-
"blank_duration": 0.3
70+
"blank_duration": 0.2
6871
},
6972
{
7073
"speaker": "speaker1",
@@ -76,19 +79,19 @@
7679
"speaker": "speaker2",
7780
"content": "That's right.",
7881
"type": "reaction",
79-
"blank_duration": 0.3
82+
"blank_duration": 0.2
8083
},
8184
{
8285
"speaker": "speaker1",
8386
"content": "And it's awesome. There are a lot of cheating accusations, so this is going to be fun.",
8487
"type": "narrative",
85-
"blank_duration": 0.3
88+
"blank_duration": 0.2
8689
},
8790
{
8891
"speaker": "speaker2",
8992
"content": "It always is, isn't it?",
9093
"type": "question",
91-
"blank_duration": 0.3
94+
"blank_duration": 0.2
9295
},
9396
{
9497
"speaker": "speaker1",
@@ -112,19 +115,19 @@
112115
"speaker": "speaker2",
113116
"content": "Exactly, it's part of the fun. We're trying to figure out what's going on in real-time.",
114117
"type": "narrative",
115-
"blank_duration": 0.3
118+
"blank_duration": 0.2
116119
},
117120
{
118121
"speaker": "speaker1",
119122
"content": "So the first line that caught my attention was... It seems to say \"look, they taught it from the CBA\". Well, I'm not sure what all that means, but...",
120123
"type": "narrative",
121-
"blank_duration": 0.3
124+
"blank_duration": 0.2
122125
},
123126
{
124127
"speaker": "speaker2",
125128
"content": "Yeah, CBA could be some kind of game or maybe a particular version of the game, a map, or a custom game mode. It gives us a bit more context.",
126129
"type": "narrative",
127-
"blank_duration": 0.3
130+
"blank_duration": 0.2
128131
},
129132
{
130133
"speaker": "speaker1",
@@ -136,7 +139,7 @@
136139
"speaker": "speaker2",
137140
"content": "Totally. We need more information to be our guide in this chat.",
138141
"type": "narrative",
139-
"blank_duration": 0.3
142+
"blank_duration": 0.2
140143
},
141144
{
142145
"speaker": "speaker1",
@@ -154,7 +157,7 @@
154157
"speaker": "speaker1",
155158
"content": "Yeah.",
156159
"type": "reaction",
157-
"blank_duration": 0.3
160+
"blank_duration": 0.2
158161
},
159162
{
160163
"speaker": "speaker2",

src/neuralnoise/studio/agents.py

+17-9
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
from pydub.effects import normalize
1717
from tqdm.auto import tqdm
1818

19-
from neuralnoise.models import StudioConfig
19+
from neuralnoise.models import ContentAnalysis, StudioConfig, PodcastScript
2020
from neuralnoise.studio.hooks import (
2121
optimize_chat_history_hook,
2222
save_last_json_message_hook,
@@ -43,12 +43,6 @@ def __init__(self, work_dir: str | Path, config: StudioConfig, max_round: int =
4343
"api_key": os.environ["OPENAI_API_KEY"],
4444
}
4545

46-
self.llm_json_mode_config = {
47-
"response_format": {"type": "json_object"},
48-
"model": "gpt-4o",
49-
"api_key": os.environ["OPENAI_API_KEY"],
50-
}
51-
5246
self.agents: list[Agent] = []
5347
for attr in dir(self):
5448
if hasattr(getattr(self, attr), "is_agent"):
@@ -74,7 +68,14 @@ def content_analyzer_agent(self) -> AssistantAgent:
7468
system_message=self.load_prompt(
7569
"content_analyzer.system", language=self.language
7670
),
77-
llm_config={"config_list": [self.llm_json_mode_config]},
71+
llm_config={
72+
"config_list": [
73+
{
74+
**self.llm_default_config,
75+
"response_format": ContentAnalysis,
76+
}
77+
]
78+
},
7879
)
7980
agent.register_hook(
8081
hookable_method="process_message_before_send",
@@ -103,7 +104,14 @@ def script_generator_agent(self) -> AssistantAgent:
103104
min_segments=str(self.config.show.min_segments),
104105
max_segments=str(self.config.show.max_segments),
105106
),
106-
llm_config={"config_list": [self.llm_json_mode_config]},
107+
llm_config={
108+
"config_list": [
109+
{
110+
**self.llm_default_config,
111+
"response_format": PodcastScript,
112+
}
113+
]
114+
},
107115
)
108116
agent.register_hook(
109117
hookable_method="process_message_before_send",

0 commit comments

Comments
 (0)