Improved prompts and using structured output pydantic models

leopiney · leopiney · commit 1b8f83570899 · 2024-12-12T23:27:17.000-03:00
diff --git a/src/neuralnoise/models.py b/src/neuralnoise/models.py
@@ -2,7 +2,7 @@
 from textwrap import dedent
 from typing import Literal
 
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, field_validator
 
 
 class VoiceSettings(BaseModel):
@@ -70,3 +70,41 @@ def render_speakers_details(self) -> str:
             speaker.render(speaker_id, ["name", "about"])
             for speaker_id, speaker in self.speakers.items()
         )
+
+
+class ContentSegment(BaseModel):
+    topic: str
+    duration: float  # in minutes
+    discussion_points: list[str]
+
+
+class ContentAnalysis(BaseModelDisplay):
+    title: str
+    summary: str
+    key_points: list[str]
+    tone: str
+    target_audience: str
+    potential_segments: list[ContentSegment]
+    controversial_topics: list[str]
+
+
+class ScriptSegment(BaseModel):
+    id: int
+    speaker: Literal["speaker1", "speaker2"]
+    content: str
+    type: Literal["narrative", "reaction", "question"]
+    blank_duration: float | None = Field(
+        None, description="Time in seconds for silence after speaking"
+    )
+
+    @field_validator("blank_duration")
+    def validate_blank_duration(cls, v):
+        if v is not None and v not in (0.1, 0.2, 0.5):
+            raise ValueError("blank_duration must be 0.1, 0.2, or 0.5 seconds")
+        return v
+
+
+class PodcastScript(BaseModel):
+    section_id: int
+    section_title: str
+    segments: list[ScriptSegment]
diff --git a/src/neuralnoise/prompts/content_analyzer.system.xml b/src/neuralnoise/prompts/content_analyzer.system.xml
@@ -1,36 +1,35 @@
 <content-analyzer-agent>
-  <context>
-    - You are a content analyst for podcasts. Analyze the provided content and extract key information to create an engaging script.
-    - Remember to create a final section with conclusions and podcast wrap-up.
-    - Create sections that cover the main points and arguments of the content.
-    - The user will write the content in the XML tag named <![CDATA[ <content> ... </content> ]]>
-    - If there are multiple content documents, you'll receive them enclosed individually in an XML tag named <![CDATA[ <document> ... </document> ]]>
-  </context>
+  <purpose>
+    You are a content analyst for podcasts. Analyze the provided content and extract key information
+    to create an engaging script.
+  </purpose>
+  <instructions>
+    <instruction>Remember to create a final section with conclusions and podcast wrap-up.</instruction>
+    <instruction>Create sections that cover the main points and arguments of the content.</instruction>
+    <instruction>The user will write the content in the XML tag named <![CDATA[ <content> ... </content> ]]></instruction>
+    <instruction>If there are multiple content documents, you'll receive them enclosed individually
+      in an XML tag named <![CDATA[ <document> ... </document> ]]></instruction>
+  </instructions>
   <output-format>
     Provide your analysis in JSON format that conforms to the following TypeScript interface:
 
-    <![CDATA[
+  <![CDATA[
       interface ContentAnalysis {
         title: string;
         summary: string;
         keyPoints: string[];
         tone: string;
         targetAudience: string;
-        suggestedDuration: number;
         potentialSegments: {
           topic: string;
           duration: number;
           discussionPoints: string[];
         }[];
         controversialTopics: string[];
-        expertOpinions: {
-          expert: string;
-          opinion: string;
-        }[];
       }
     ]]>
   </output-format>
   <language>
     ${language}
   </language>
-</content-analyzer-agent>
+</content-analyzer-agent>
diff --git a/src/neuralnoise/prompts/editor.system.xml b/src/neuralnoise/prompts/editor.system.xml
@@ -4,29 +4,32 @@
     coherence. You ask the ScriptGeneratorAgent to generate a new script based on your suggestions.
   </context>
 
-  <guidelines>
-    - Evaluate structure, depth, transitions, and dialogue naturalness
-    - Limit iterations to 2 per generated section
-    - Ensure natural conversation flow:
-    - Avoid formal introductions/conclusions for sections
-    - Encourage quick interactions and questions between speakers
-    - Make sure that speaker1 talks more than speaker2
-    - Check that emojis are not used
-    - Ask the ScriptGeneratorAgent to generate a few more segments with reactions or questions if
-    needed.
-  </guidelines>
+  <instructions>
+    <instruction>Evaluate structure, depth, transitions, and dialogue naturalness</instruction>
+    <instruction>Limit iterations to 2 per generated section</instruction>
+    <instruction>Ensure natural conversation flow</instruction>
+    <instruction>Avoid formal introductions/conclusions for sections</instruction>
+    <instruction>Encourage quick interactions and questions between speakers</instruction>
+    <instruction>Make sure that speaker1 talks more than speaker2</instruction>
+    <instruction>Check that emojis are not used</instruction>
+    <instruction>Content flow and engagement: make sure to not talk about the last topic in the
+      introductions. Engage the user by introducing the topics slowly</instruction>
+    <instruction>Ask the ScriptGeneratorAgent to generate a few more segments with reactions or
+      questions if
+      needed.</instruction>
+  </instructions>
 
   <output-format>
     Provide concise editing suggestions.
     Alternatively, if the script is approved, conclude with 'EDITOR-OK'.
   </output-format>
 
   <important-notes>
-    - Only the EditorAgent can write "EDITOR-OK"
-    - Focus on the latest script version from the ScriptGeneratorAgent
-    - If you provide editing suggestions, the ScriptGeneratorAgent will generate a new script based
-    on your suggestions. Don't say EDITOR-OK in this case.
-    - PlannerAgent proceeds to the next section after "EDITOR-OK"
+    <important>Only the EditorAgent can write "EDITOR-OK"</important>
+    <important>Focus on the latest script version from the ScriptGeneratorAgent</important>
+    <important>If you provide editing suggestions, the ScriptGeneratorAgent will generate a new
+      script based on your suggestions. Don't say EDITOR-OK in this case.</important>
+    <important>PlannerAgent proceeds to the next section after "EDITOR-OK"</important>
   </important-notes>
   <language>
     ${language}
diff --git a/src/neuralnoise/prompts/script_generation.system.xml b/src/neuralnoise/prompts/script_generation.system.xml
@@ -19,7 +19,7 @@
             speaker: "speaker1" | "speaker2";
             content: string;
             type: "narrative" | "reaction" | "question";
-            blank_duration?: number; // Time in seconds (0.1, 0.2, 0.3, or 0.5) for silence after speaking
+            blank_duration?: number; // Time in seconds (0.1, 0.2, or 0.5) for silence after speaking
           }>;
         }
       ]]>
@@ -33,21 +33,24 @@
       Supporting role. Asks questions, reacts to speaker1's statements and helps clarify points.
     </speaker2>
   </speaker-roles>
-  <guidelines>
-    - Follow the PlannerAgent's instructions for each section.
-    - Use colloquial language and occasional filler words for natural dialogue.
-    - Start each section with a natural transition from the previous one.
-    - Don't say "let's start" or "let's continue". Avoid saying things like "On the next section, we
-    will..."
-    - Avoid formal introductions or conclusions for sections/segments.
-    - Allow speakers to ask and answer questions naturally.
-    - Create ${min_segments}-${max_segments} segments per section, with a mix of short and long segments.
-    - Include some very short segments (1-2 words) for quick interactions and expressing emotions or
-    reactions. For example: "Yeah.", "Right?", "So cool."
-    - Use pauses (blank_duration) where appropriate.
-    - Don't use emojis in the script.
-    - Don't add any metadata about emotions or laughter in the script.
-  </guidelines>
+  <instructions>
+    <instruction>Follow the PlannerAgent's instructions for each section.</instruction>
+    <instruction>Use colloquial language and occasional filler words for natural dialogue.</instruction>
+    <instruction>Start each section with a natural transition from the previous one.</instruction>
+    <instruction>Don't say "let's start" or "let's continue". Avoid saying things like "On the next
+      section, we will..."</instruction>
+    <instruction>Avoid formal introductions or conclusions for sections/segments.</instruction>
+    <instruction>Allow speakers to ask and answer questions naturally.</instruction>
+    <instruction>Create ${min_segments}-${max_segments} segments per section, with a mix of short
+      and
+      long segments.</instruction>
+    <instruction>Include some very short segments (1-2 words) for quick interactions and expressing
+      emotions or reactions. For example: "Yeah.", "Right?", "So cool."</instruction>
+    <instruction>Use pauses (blank_duration) where appropriate.</instruction>
+    <instruction>Introduce the topics slowly, don't talk about the last topic in the introductions.</instruction>
+    <instruction>Don't use emojis in the script.</instruction>
+    <instruction>Don't add any metadata about emotions or laughter in the script.</instruction>
+  </instructions>
   <conversation-example>
     <![CDATA[ 
       {
@@ -64,7 +67,7 @@
             "speaker": "speaker2",
             "content": "Hold on tight, because...",
             "type": "reaction",
-            "blank_duration": 0.3
+            "blank_duration": 0.2
           },
           {
             "speaker": "speaker1",
@@ -76,19 +79,19 @@
             "speaker": "speaker2",
             "content": "That's right.",
             "type": "reaction",
-            "blank_duration": 0.3
+            "blank_duration": 0.2
           },
           {
             "speaker": "speaker1",
             "content": "And it's awesome. There are a lot of cheating accusations, so this is going to be fun.",
             "type": "narrative",
-            "blank_duration": 0.3
+            "blank_duration": 0.2
           },
           {
             "speaker": "speaker2",
             "content": "It always is, isn't it?",
             "type": "question",
-            "blank_duration": 0.3
+            "blank_duration": 0.2
           },
           {
             "speaker": "speaker1",
@@ -112,19 +115,19 @@
             "speaker": "speaker2",
             "content": "Exactly, it's part of the fun. We're trying to figure out what's going on in real-time.",
             "type": "narrative",
-            "blank_duration": 0.3
+            "blank_duration": 0.2
           },
           {
             "speaker": "speaker1",
             "content": "So the first line that caught my attention was... It seems to say \"look, they taught it from the CBA\". Well, I'm not sure what all that means, but...",
             "type": "narrative",
-            "blank_duration": 0.3
+            "blank_duration": 0.2
           },
           {
             "speaker": "speaker2",
             "content": "Yeah, CBA could be some kind of game or maybe a particular version of the game, a map, or a custom game mode. It gives us a bit more context.",
             "type": "narrative",
-            "blank_duration": 0.3
+            "blank_duration": 0.2
           },
           {
             "speaker": "speaker1",
@@ -136,7 +139,7 @@
             "speaker": "speaker2",
             "content": "Totally. We need more information to be our guide in this chat.",
             "type": "narrative",
-            "blank_duration": 0.3
+            "blank_duration": 0.2
           },
           {
             "speaker": "speaker1",
@@ -154,7 +157,7 @@
             "speaker": "speaker1",
             "content": "Yeah.",
             "type": "reaction",
-            "blank_duration": 0.3
+            "blank_duration": 0.2
           },
           {
             "speaker": "speaker2",
diff --git a/src/neuralnoise/studio/agents.py b/src/neuralnoise/studio/agents.py
@@ -16,7 +16,7 @@
 from pydub.effects import normalize
 from tqdm.auto import tqdm
 
-from neuralnoise.models import StudioConfig
+from neuralnoise.models import ContentAnalysis, StudioConfig, PodcastScript
 from neuralnoise.studio.hooks import (
     optimize_chat_history_hook,
     save_last_json_message_hook,
@@ -43,12 +43,6 @@ def __init__(self, work_dir: str | Path, config: StudioConfig, max_round: int =
             "api_key": os.environ["OPENAI_API_KEY"],
         }
 
-        self.llm_json_mode_config = {
-            "response_format": {"type": "json_object"},
-            "model": "gpt-4o",
-            "api_key": os.environ["OPENAI_API_KEY"],
-        }
-
         self.agents: list[Agent] = []
         for attr in dir(self):
             if hasattr(getattr(self, attr), "is_agent"):
@@ -74,7 +68,14 @@ def content_analyzer_agent(self) -> AssistantAgent:
             system_message=self.load_prompt(
                 "content_analyzer.system", language=self.language
             ),
-            llm_config={"config_list": [self.llm_json_mode_config]},
+            llm_config={
+                "config_list": [
+                    {
+                        **self.llm_default_config,
+                        "response_format": ContentAnalysis,
+                    }
+                ]
+            },
         )
         agent.register_hook(
             hookable_method="process_message_before_send",
@@ -103,7 +104,14 @@ def script_generator_agent(self) -> AssistantAgent:
                 min_segments=str(self.config.show.min_segments),
                 max_segments=str(self.config.show.max_segments),
             ),
-            llm_config={"config_list": [self.llm_json_mode_config]},
+            llm_config={
+                "config_list": [
+                    {
+                        **self.llm_default_config,
+                        "response_format": PodcastScript,
+                    }
+                ]
+            },
         )
         agent.register_hook(
             hookable_method="process_message_before_send",