# openai-phoneme-exploration/pronounce.py at main · georgemandis/openai-phoneme-exploration · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
from pathlib import Path
from openai import OpenAI
from pydantic import BaseModel, Field
import argparse

# Shared OpenAI client used by main() for both the structured phoneme request
# and the text-to-speech calls. It is constructed with no arguments, so
# credentials/configuration presumably come from the environment — confirm.
client = OpenAI()

class Word(BaseModel):
    # Structured-output schema: main() passes this class as `text_format` and
    # reads the parsed `phonemes` list back from the response.
    # NOTE(review): documented with comments rather than a class docstring,
    # since a docstring may be emitted into the generated JSON schema.
    phonemes: list[str] = Field(
        description="A list of phonemes that compromise a given word",
    )

def generate_html(word: str, phonemes: list[str]) -> str:
    """Build a standalone HTML page that plays the phonemes of a word.

    The page shows the word, one row per phoneme (label, ``<audio>`` element
    and a "Play" button) and a "Play All Phonemes" button that plays the
    clips one after another. Each clip is referenced as
    ``../sounds/<phoneme>.mp3`` relative to the page.

    Args:
        word: The word being pronounced; shown in the title and heading.
        phonemes: Phonemes in pronunciation order. May be empty, and may
            contain repeated entries.

    Returns:
        The complete HTML document as a string.
    """
    # Function-scope imports keep this block self-contained.
    from html import escape
    from urllib.parse import quote

    # Both the word and the phonemes are arbitrary text, so they must be
    # escaped before being placed in markup.
    safe_word = escape(word)

    page_head = f"""<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <title>Pronunciation of {safe_word}</title>
    <style>
        body {{
            font-family: Arial, sans-serif;
            max-width: 800px;
            margin: 0 auto;
            padding: 20px;
        }}
        .word {{
            font-size: 2em;
            margin-bottom: 20px;
        }}
        .phoneme {{
            margin: 10px 0;
            padding: 10px;
            background: #f5f5f5;
            border-radius: 5px;
        }}
        .play-all {{
            margin: 20px 0;
            padding: 10px 20px;
            background: #4CAF50;
            color: white;
            border: none;
            border-radius: 5px;
            cursor: pointer;
        }}
        .play-all:hover {{
            background: #45a049;
        }}
    </style>
</head>
<body>
    <div class="word">{safe_word}</div>
    <button class="play-all" onclick="playAll()">Play All Phonemes</button>
    <div id="phonemes">
"""

    rows = []
    for index, phoneme in enumerate(phonemes):
        # Percent-encode the file name as a single URL path segment, then
        # HTML-escape the whole attribute value.
        audio_src = escape(f"../sounds/{quote(phoneme, safe='')}.mp3")
        # Ids are position-based: a phoneme can occur more than once in a
        # word, and phoneme text is not guaranteed to be a valid id or a
        # safe JavaScript string literal.
        rows.append(f"""
        <div class="phoneme">
            <span>{escape(phoneme)}</span>
            <audio id="audio-{index}" src="{audio_src}"></audio>
            <button onclick="playPhoneme({index})">Play</button>
        </div>""")

    page_tail = """
    </div>
    <script>
        function playPhoneme(index) {
            const audio = document.getElementById(`audio-${index}`);
            audio.currentTime = 0;
            audio.play();
        }

        async function playAll() {
            const phonemes = document.querySelectorAll('.phoneme');
            for (const phoneme of phonemes) {
                const audio = phoneme.querySelector('audio');
                audio.currentTime = 0;
                await new Promise(resolve => {
                    audio.onended = resolve;
                    // Move on instead of waiting forever if a clip is
                    // missing or cannot be played.
                    audio.onerror = resolve;
                    audio.play().catch(resolve);
                });
            }
        }
    </script>
</body>
</html>"""

    return page_head + "".join(rows) + page_tail

def main():
    """Generate phoneme audio clips and an HTML player page for one word.

    Reads the word from the command line, asks the model for its phonemes as
    structured output, synthesizes one MP3 per phoneme into ``sounds/`` next
    to this script (existing clips are reused), and writes
    ``words/<word>.html`` unless that file already exists.

    Raises:
        SystemExit: If the model response contains no parsed phoneme list.
    """
    parser = argparse.ArgumentParser(description='Generate phoneme pronunciations for a word')
    parser.add_argument('word', type=str, help='The word to generate phonemes for')
    args = parser.parse_args()

    parsed_response = client.responses.parse(
        model="gpt-4o-2024-08-06",
        input=[
            {
                "role": "system",
                "content": "Given a provided word return a list of phoenemes in the latin alphabet that compromise how to pronounce the word.",
            },
            {"role": "user", "content": args.word},
        ],
        text_format=Word,
    )

    # The parsed payload can be absent (for example when the model refuses or
    # returns text that does not match the schema); fail with a clear message
    # instead of an AttributeError on None.
    word_result = parsed_response.output_parsed
    if word_result is None:
        raise SystemExit(
            f"Could not get phonemes for {args.word!r}: "
            "the model returned no structured output."
        )
    phonemes = word_result.phonemes

    script_dir = Path(__file__).parent

    # Create sounds directory if it doesn't exist
    sounds_dir = script_dir / "sounds"
    sounds_dir.mkdir(exist_ok=True)

    # Create words directory if it doesn't exist
    words_dir = script_dir / "words"
    words_dir.mkdir(exist_ok=True)

    # Generate audio files
    for phoneme in phonemes:
        speech_file_path = sounds_dir / f"{phoneme}.mp3"

        if not speech_file_path.exists():
            # Stream into a temporary file and rename it once complete, so an
            # interrupted download is never mistaken for a cached clip by the
            # exists() check on a later run.
            partial_path = sounds_dir / f"{phoneme}.mp3.part"
            with client.audio.speech.with_streaming_response.create(
                model="gpt-4o-mini-tts",
                voice="coral",
                input=phoneme,
                instructions="Pronounce only the provided phoneme clearly and deliberately, with good anunciation.",
            ) as speech_response:
                speech_response.stream_to_file(partial_path)
            partial_path.replace(speech_file_path)

        print(f"{phoneme} -> {speech_file_path}")

    # Generate HTML file
    html_file_path = words_dir / f"{args.word}.html"
    if not html_file_path.exists():
        html_content = generate_html(args.word, phonemes)
        # Write UTF-8 explicitly so non-ASCII words or phonemes do not depend
        # on the platform's default encoding.
        html_file_path.write_text(html_content, encoding="utf-8")
        print(f"Generated HTML file: {html_file_path}")

# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()

# print(phonemes)

# # generate a file pronouncing phonemes