PocketFlow-Tutorial-Wan-Video/nodes.py at main · The-Pocket/PocketFlow-Tutorial-Wan-Video · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
import os
import yaml
from pocketflow import Node, BatchNode
from utils.call_llm import call_llm
from utils.ali_api import generate_image, animate_image
from utils.audio import generate_audio
from utils.ffmpeg import merge_audio_video, concat_videos

# Appearance text for each cartoon character, keyed by character name.
# These strings are interpolated verbatim into the LLM prompts built by
# GenerateScenesNode.exec and GenerateScriptNode.exec, so editing them changes
# what the models are asked to draw and write.
CHARACTER_DESC = {
    "Ding Ding Dog": "A cute blue robotic puppy with big floppy dog ears, a short dog snout, round black nose, small wagging tail, stubby paws, and a golden bell on its red collar. Walks on two legs and has a magic pocket on its round belly. Friendly expression with tongue slightly out.",
    "Mia": "A cheerful girl with pigtails and round glasses.",
}

# Art-direction prefix for image prompts. GenerateScriptNode tells the LLM to
# start every image_prompt with it, and GenerateImageNode prepends it when the
# returned prompt does not already start with it.
IMAGE_STYLE = "Japanese children anime cartoon style, clean line art, bright pastel colors, simple cute character design, wide 16:9 composition"


# ── 1. GenerateScenes ──────────────────────────────────────────────

class GenerateScenesNode(Node):
    """Plan the cartoon: read the article and ask the LLM for a list of scenes.

    Reads ``shared["md_path"]``; writes ``shared["md_content"]``,
    ``shared["scenes"]`` and resets the per-scene result lists
    (``scripts``, ``images``, ``audios``, ``videos``) and ``current_idx``.
    """

    def prep(self, shared):
        """Load the article text from ``shared["md_path"]`` (UTF-8).

        The text is also stored in ``shared["md_content"]`` because
        GenerateScriptNode.prep reads it from there.

        Returns:
            The article text.
        """
        path = shared["md_path"]
        with open(path, "r", encoding="utf-8") as f:
            shared["md_content"] = f.read()
        return shared["md_content"]

    def exec(self, content):
        """Ask the LLM for a scene plan and return the validated scene list.

        Args:
            content: The article text returned by ``prep``.

        Returns:
            A list of at least two dicts, each with non-empty string
            ``speaker`` and ``description`` entries.

        Raises:
            ValueError: If the response has no fenced YAML block or the YAML
                does not describe a valid scene list.
        """
        prompt = f"""You are a cartoon scene planner. Given this technical article, plan 4-8 scenes for an educational cartoon.

CHARACTERS:
- Mia: {CHARACTER_DESC['Mia']} She's a student who struggles with the topic and asks questions.
- Ding Ding Dog: {CHARACTER_DESC['Ding Ding Dog']} He's the wise helper who explains concepts with gadgets and analogies.

RULES:
- Each scene has ONE speaker only. The conversation alternates across scenes.
- Typically: Mia speaks (confused/curious) → Ding Ding Dog speaks (explains) → Mia (follow-up question) → Ding Ding Dog (deeper explanation) → ...
- Early scenes: Mia is frustrated/confused, motivating the problem
- Later scenes: Ding Ding Dog explains with gadgets, analogies, and visual aids
- Final scene: Mia celebrates understanding

ARTICLE:
{content}

Output a YAML list of scene descriptions. Each scene should note who speaks and what happens.
```yaml
scenes:
  - speaker: "Mia"
    description: "Mia is sitting at her desk looking frustrated, surrounded by homework papers about neural networks"
  - speaker: "Ding Ding Dog"
    description: "Ding Ding Dog pulls a glowing brain gadget from his belly pocket and starts explaining"
```"""
        # Only the first attempt passes use_cache=True; retries pass False
        # (presumably so a cached bad answer is not replayed — cur_retry is
        # expected to be maintained by the Node base class).
        use_cache = self.cur_retry == 0
        response = call_llm(prompt, use_cache=use_cache)
        return self._parse_scenes(response)

    @staticmethod
    def _parse_scenes(response):
        """Extract and validate the scene list from a raw LLM response.

        Validation happens here, inside ``exec``, so that malformed output
        fails before ``post`` touches ``s['speaker']`` / ``s['description']``.
        Explicit raises are used instead of ``assert`` so the checks still run
        under ``python -O``.
        """
        parts = response.split("```yaml", 1)
        if len(parts) < 2:
            raise ValueError("LLM response does not contain a ```yaml fenced block")
        yaml_str = parts[1].split("```", 1)[0].strip()
        result = yaml.safe_load(yaml_str)

        # The prompt asks for a `scenes:` mapping, but it also says "YAML list",
        # so tolerate a bare top-level list as well.
        scenes = result.get("scenes") if isinstance(result, dict) else result
        if not isinstance(scenes, list) or len(scenes) < 2:
            raise ValueError("Scene plan must be a YAML list with at least 2 scenes")

        for number, scene in enumerate(scenes, 1):
            if not isinstance(scene, dict):
                raise ValueError(f"Scene {number} is not a mapping: {scene!r}")
            for key in ("speaker", "description"):
                value = scene.get(key)
                if not isinstance(value, str) or not value.strip():
                    raise ValueError(
                        f"Scene {number} needs a non-empty string '{key}', got {value!r}"
                    )
        return scenes

    def post(self, shared, prep_res, exec_res):
        """Store the scene plan, reset downstream state, and print a summary."""
        shared["scenes"] = exec_res
        # Fresh containers for the later nodes, which append one entry per scene.
        shared["scripts"] = []
        shared["images"] = []
        shared["audios"] = []
        shared["videos"] = []
        shared["current_idx"] = 0
        print(f"Planned {len(exec_res)} scenes:")
        for i, s in enumerate(exec_res, 1):
            print(f"  {i}. [{s['speaker']}] {s['description'][:70]}...")


# ── 2. GenerateScript (self-loop) ──────────────────────────────────

class GenerateScriptNode(Node):
    """Write the script for one scene per run, looping until all scenes are done.

    Uses ``shared["current_idx"]`` as the cursor into ``shared["scenes"]``.
    ``post`` returns ``"next"`` while scenes remain and ``"done"`` (after
    resetting the cursor to 0) once every scene has a script.
    """

    # Fields every script must provide; later nodes index them directly.
    _REQUIRED_KEYS = ("speaker", "text", "image_prompt", "animation_prompt")

    def prep(self, shared):
        """Collect the inputs for the current scene.

        Returns:
            ``None`` when the cursor is already past the last scene, otherwise
            a dict with the current scene, all scenes, the article text and a
            copy of the scripts written so far.
        """
        idx = shared["current_idx"]
        if idx >= len(shared["scenes"]):
            return None
        return {
            "scene": shared["scenes"][idx],
            "all_scenes": shared["scenes"],
            "md_content": shared["md_content"],
            # Copy so the prompt context is a snapshot of the list at prep time.
            "previous_scripts": shared["scripts"].copy(),
        }

    def exec(self, data):
        """Ask the LLM for the current scene's script.

        Args:
            data: The dict built by ``prep``, or ``None`` when nothing is left.

        Returns:
            ``None`` if ``data`` is ``None``; otherwise a dict with non-empty
            string ``speaker``, ``text``, ``image_prompt`` and
            ``animation_prompt`` entries.

        Raises:
            ValueError: If the response has no fenced YAML block or the YAML
                is not a mapping with all required non-empty string fields.
        """
        if data is None:
            return None
        scene = data["scene"]
        all_scenes = data["all_scenes"]
        previous = data["previous_scripts"]

        scenes_list = "\n".join(
            f"  {i+1}. [{s['speaker']}] {s['description']}" for i, s in enumerate(all_scenes)
        )
        prev_text = "\n\n".join(
            f"Scene {i+1} [{s['speaker']}]: {s['text']}" for i, s in enumerate(previous)
        ) if previous else "(This is the first scene)"

        prompt = f"""You are a cartoon script writer. Write the script for ONE scene of an educational cartoon.

CHARACTERS:
- Mia: {CHARACTER_DESC['Mia']}
- Ding Ding Dog: {CHARACTER_DESC['Ding Ding Dog']}

ALL SCENES (for context):
{scenes_list}

PREVIOUS SCENE SCRIPTS:
{prev_text}

CURRENT SCENE:
Speaker: {scene['speaker']}
Description: {scene['description']}

ORIGINAL ARTICLE (for technical accuracy):
{data['md_content']}

RULES:
1. Write 1-2 SHORT sentences for this speaker (MAX 40 words). Conversational, warm, like explaining to a friend. Keep it punchy — one key idea per scene.
2. This is a CONTINUATION of the conversation from previous scenes — don't repeat what was already said, build on it naturally.
3. For image_prompt: MUST start with "{IMAGE_STYLE}:" and include BOTH character descriptions — both Mia and Ding Ding Dog should appear in the scene together (the speaker is the focus, the other reacts).
   CRITICAL — EVERY SCENE MUST HAVE A DISTINCT VISUAL COMPOSITION:
   - Different setting/location (bedroom, classroom, park, lab, kitchen, rooftop, etc.)
   - Different camera angle (wide shot, close-up, over-the-shoulder, bird's-eye, low angle)
   - Different character poses and actions (standing, sitting, pointing at board, holding gadget, jumping, etc.)
   - Different props and visual aids relevant to the topic being discussed
   The reference image is ONLY for character design consistency — the scene, angle, background, and composition must be completely different each time.
4. For animation_prompt: describe camera movement and character motion — make it dynamic.

Output in YAML:
```yaml
speaker: "{scene['speaker']}"
text: "What the speaker says..."
image_prompt: "{IMAGE_STYLE}: ..."
animation_prompt: "Camera movement description..."
```"""
        # Only the first attempt passes use_cache=True; retries pass False
        # (presumably so a cached bad answer is not replayed — cur_retry is
        # expected to be maintained by the Node base class).
        use_cache = self.cur_retry == 0
        response = call_llm(prompt, use_cache=use_cache)
        return self._parse_script(response)

    @classmethod
    def _parse_script(cls, response):
        """Extract and validate the script mapping from a raw LLM response.

        Checks are explicit raises rather than ``assert`` so they still run
        under ``python -O``, and they require real strings so that ``post``
        (which slices ``text``) and the image node (which calls
        ``startswith`` on ``image_prompt``) do not fail later on bad output.
        """
        parts = response.split("```yaml", 1)
        if len(parts) < 2:
            raise ValueError("LLM response does not contain a ```yaml fenced block")
        yaml_str = parts[1].split("```", 1)[0].strip()
        result = yaml.safe_load(yaml_str)

        if not isinstance(result, dict):
            raise ValueError(f"Script must be a YAML mapping, got {type(result).__name__}")
        for key in cls._REQUIRED_KEYS:
            value = result.get(key)
            if not isinstance(value, str) or not value.strip():
                raise ValueError(f"Script needs a non-empty string '{key}', got {value!r}")
        return result

    def post(self, shared, prep_res, exec_res):
        """Record the script, advance the cursor and choose the next action.

        Returns:
            ``"next"`` to loop back for another scene, or ``"done"`` once all
            scenes are scripted (the cursor is reset to 0 in that case).
        """
        if exec_res is None:
            # prep found nothing left to do; just leave the loop.
            shared["current_idx"] = 0
            return "done"
        shared["scripts"].append(exec_res)
        shared["current_idx"] += 1
        idx = shared["current_idx"]
        total = len(shared["scenes"])
        print(f"Script {idx}/{total} [{exec_res['speaker']}]: {exec_res['text'][:60]}...")
        if idx >= total:
            shared["current_idx"] = 0
            return "done"
        return "next"


# ── 3. GenerateImage (Batch) ──────────────────────────────────────

class GenerateImageNode(BatchNode):
    """Render one still image per script, in script order.

    Each image after the first is generated with the previous scene's image
    as an extra reference, so ``exec`` appends to ``shared["images"]`` as it
    goes and derives the scene number from that list's length.
    """

    def prep(self, shared):
        """Remember the shared store for ``exec`` and return the scripts to process."""
        self._shared = shared
        return shared["scripts"]

    def exec(self, script):
        """Generate the image for one script and append its result to the shared list.

        Args:
            script: A script dict; its ``image_prompt`` entry is used.

        Returns:
            Whatever ``generate_image`` returns for this scene.
        """
        store = self._shared
        finished = store["images"]
        scene_no = len(finished) + 1
        target = os.path.join(store["output_dir"], f"{scene_no}.png")

        # Make sure the style prefix leads the prompt even if the LLM dropped it.
        styled = script["image_prompt"]
        if not styled.startswith(IMAGE_STYLE):
            styled = f"{IMAGE_STYLE}: {styled}"
        full_prompt = styled + " IMPORTANT: Use the first reference image for character design consistency. Use the second reference image (if present) for environment/color style continuity. Keep characters identical across scenes. But change the camera ANGLE, character POSE, and COMPOSITION — do NOT copy the same framing."

        # First reference: the character sheet. Second (when available): the
        # most recently generated scene image.
        reference_paths = [store["ref_image"]]
        if finished:
            reference_paths.append(finished[-1])

        image = generate_image(full_prompt, target, ref_image_paths=reference_paths)
        finished.append(image)
        print(f"Image {scene_no}/{len(store['scripts'])} done")
        return image

    def post(self, shared, prep_res, exec_res):
        """Publish the accumulated image list back onto the shared store."""
        shared["images"] = self._shared["images"]


# ── 4. GenerateAudio (Batch) ──────────────────────────────────────

class GenerateAudioNode(BatchNode):
    """Synthesize one audio clip per script, in script order.

    ``exec`` appends each result to ``shared["audios"]`` and uses that list's
    length to number the output files.
    """

    def prep(self, shared):
        """Remember the shared store for ``exec`` and return the scripts to process."""
        self._shared = shared
        return shared["scripts"]

    def exec(self, script):
        """Generate the audio for one script and append its result to the shared list.

        Args:
            script: A script dict; its ``text`` and ``speaker`` entries are used.

        Returns:
            Whatever ``generate_audio`` returns for this scene.
        """
        store = self._shared
        finished = store["audios"]
        clip_no = len(finished) + 1
        target = os.path.join(store["output_dir"], f"{clip_no}.mp3")

        clip = generate_audio(script["text"], script["speaker"], target)
        finished.append(clip)
        print(f"Audio {clip_no}/{len(store['scripts'])} done")
        return clip

    def post(self, shared, prep_res, exec_res):
        """Publish the accumulated audio list back onto the shared store."""
        shared["audios"] = self._shared["audios"]


# ── 5. AnimateVideo (Batch) ───────────────────────────────────────

class AnimateVideoNode(BatchNode):
    """Animate each scene image into a video clip sized to its audio.

    ``exec`` appends each result to ``shared["videos"]`` and uses that list's
    length to number the output files.
    """

    # Upper bound, in seconds, on the clip duration requested from animate_image.
    MAX_VIDEO_SECONDS = 15

    def prep(self, shared):
        """Remember the shared store and pair up each scene's image, script and audio.

        Note that ``zip`` stops at the shortest of the three lists.
        """
        self._shared = shared
        return list(zip(shared["images"], shared["scripts"], shared["audios"]))

    @staticmethod
    def _probe_audio_seconds(audio_path):
        """Return the duration of ``audio_path`` in seconds, as reported by ffprobe.

        Raises:
            RuntimeError: If ffprobe exits with a non-zero status or its output
                does not contain a parseable ``format.duration`` value.
        """
        import json
        import subprocess

        # "-v error" keeps stdout clean JSON while still surfacing the reason
        # for a failure on stderr, which is included in the error below.
        probe = subprocess.run(
            ["ffprobe", "-v", "error", "-print_format", "json", "-show_format", audio_path],
            capture_output=True, text=True,
        )
        if probe.returncode != 0:
            raise RuntimeError(
                f"ffprobe failed (exit {probe.returncode}) for {audio_path!r}: "
                f"{probe.stderr.strip()}"
            )
        try:
            return float(json.loads(probe.stdout)["format"]["duration"])
        except (ValueError, KeyError, TypeError) as err:
            raise RuntimeError(
                f"Could not read audio duration from ffprobe output for {audio_path!r}"
            ) from err

    def exec(self, item):
        """Animate one scene image and append the result to the shared list.

        Args:
            item: An ``(image_path, script, audio_path)`` tuple from ``prep``.

        Returns:
            Whatever ``animate_image`` returns for this scene.

        Raises:
            RuntimeError: If the audio duration cannot be determined.
        """
        image_path, script, audio_path = item
        idx = len(self._shared["videos"])
        output_dir = self._shared["output_dir"]
        path = os.path.join(output_dir, f"{idx + 1}.mp4")

        audio_dur = self._probe_audio_seconds(audio_path)
        # Whole seconds strictly longer than the audio, capped at the maximum.
        duration = min(int(audio_dur) + 1, self.MAX_VIDEO_SECONDS)
        print(f"  Audio duration: {audio_dur:.1f}s -> video duration: {duration}s")

        result = animate_image(image_path, script["animation_prompt"], path, duration=duration)
        self._shared["videos"].append(result)
        print(f"Video {idx + 1}/{len(self._shared['scripts'])} done")
        return result

    def post(self, shared, prep_res, exec_res):
        """Publish the accumulated video list back onto the shared store."""
        shared["videos"] = self._shared["videos"]


# ── 6. Combine ────────────────────────────────────────────────────

class CombineNode(Node):
    """Mux each scene's audio onto its video, then join the scenes into one file."""

    def prep(self, shared):
        """Pick out the video list, audio list and output directory for ``exec``."""
        return {key: shared[key] for key in ("videos", "audios", "output_dir")}

    def exec(self, data):
        """Produce ``final.mp4`` in the output directory and return its path.

        Each video/audio pair is merged into ``<n>_combined.mp4`` (1-based).
        A single merged clip is copied to the final path; otherwise the merged
        clips are handed to ``concat_videos``.
        """
        out_dir = data["output_dir"]
        merged_clips = []
        for scene_no, (video_path, audio_path) in enumerate(
            zip(data["videos"], data["audios"]), start=1
        ):
            merged_path = os.path.join(out_dir, f"{scene_no}_combined.mp4")
            merge_audio_video(video_path, audio_path, merged_path)
            merged_clips.append(merged_path)

        final_path = os.path.join(out_dir, "final.mp4")
        if len(merged_clips) != 1:
            concat_videos(merged_clips, final_path)
        else:
            # Nothing to concatenate: the only clip is the final video.
            import shutil
            shutil.copy2(merged_clips[0], final_path)
        return final_path

    def post(self, shared, prep_res, exec_res):
        """Record the final video path on the shared store and announce it."""
        shared["final_video"] = exec_res
        print(f"\nFinal video: {exec_res}")