Skip to content

Commit 64f86da

Browse files
authored
feat: add speech2text (#1855)
- Added a new STT driver in `AIInterfaceService` and `PuterAIModule`. - Added methods for audio transcription and translation in the speech-to-text interface. - Updated cost mapping for STT models in `openAiCostMap.ts`. - Updated permissions and interfaces to support new speech-to-text features.
1 parent b6af2df commit 64f86da

File tree

10 files changed

+661
-82
lines changed

10 files changed

+661
-82
lines changed

package-lock.json

Lines changed: 97 additions & 82 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/backend/package.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@
6161
"multer": "^2.0.2",
6262
"multi-progress": "^4.0.0",
6363
"murmurhash": "^2.0.1",
64+
"music-metadata": "^7.14.0",
6465
"nodemailer": "^6.9.3",
6566
"on-finished": "^2.4.1",
6667
"openai": "^6.7.0",

src/backend/src/data/hardcoded-permissions.js

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ const default_implicit_user_app_permissions = {
2525
'driver:puter-image-generation': {},
2626
'driver:puter-video-generation': {},
2727
'driver:puter-tts': {},
28+
'driver:puter-speech2txt': {},
2829
'driver:puter-apps': {},
2930
'driver:puter-subdomains': {},
3031
'driver:temp-email': {},
@@ -60,6 +61,8 @@ const implicit_user_app_permissions = [
6061
'driver:puter-chat-completion:complete': {},
6162
'driver:puter-image-generation:generate': {},
6263
'driver:puter-video-generation:generate': {},
64+
'driver:puter-speech2txt:transcribe': {},
65+
'driver:puter-speech2txt:translate': {},
6366
'driver:puter-analytics:create_trace': {},
6467
'driver:puter-analytics:record': {},
6568
},

src/backend/src/modules/puterai/AIInterfaceService.js

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -205,6 +205,50 @@ class AIInterfaceService extends BaseService {
205205
},
206206
}
207207
})
208+
209+
col_interfaces.set('puter-speech2txt', {
210+
description: 'Speech to text transcription and translation.',
211+
methods: {
212+
list_models: {
213+
description: 'List available speech-to-text models.',
214+
result: { type: 'json' },
215+
},
216+
transcribe: {
217+
description: 'Transcribe audio into text.',
218+
parameters: {
219+
file: { type: 'file' },
220+
model: { type: 'string', optional: true },
221+
response_format: { type: 'string', optional: true },
222+
language: { type: 'string', optional: true },
223+
prompt: { type: 'string', optional: true },
224+
temperature: { type: 'number', optional: true },
225+
logprobs: { type: 'flag', optional: true },
226+
timestamp_granularities: { type: 'json', optional: true },
227+
stream: { type: 'flag', optional: true },
228+
chunking_strategy: { type: 'string', optional: true },
229+
known_speaker_names: { type: 'json', optional: true },
230+
known_speaker_references: { type: 'json', optional: true },
231+
extra_body: { type: 'json', optional: true },
232+
},
233+
result: { type: 'json' },
234+
},
235+
translate: {
236+
description: 'Translate audio into English text.',
237+
parameters: {
238+
file: { type: 'file' },
239+
model: { type: 'string', optional: true },
240+
response_format: { type: 'string', optional: true },
241+
prompt: { type: 'string', optional: true },
242+
temperature: { type: 'number', optional: true },
243+
logprobs: { type: 'flag', optional: true },
244+
timestamp_granularities: { type: 'json', optional: true },
245+
stream: { type: 'flag', optional: true },
246+
extra_body: { type: 'json', optional: true },
247+
},
248+
result: { type: 'json' },
249+
},
250+
},
251+
});
208252
}
209253
}
210254

0 commit comments

Comments
 (0)