44the results as WTF (World Transcription Format) analysis entries.
55
66The vfun server provides:
7- - Multi-language speech recognition (English + auto-detect)
8- - Speaker diarization (who spoke when)
7+ - Multi-language speech recognition (English + Spanish, auto-detect)
98- GPU-accelerated processing with CUDA
109
1110Configuration options:
1211 vfun-server-url: URL of the vfun transcription server (required)
13- diarize: Enable speaker diarization (default: true)
12+ language: Language override ("en" or "es"). If omitted, vfun auto-detects.
13+ diarize: Enable speaker diarization (default: false)
1414 timeout: Request timeout in seconds (default: 300)
15- min-duration: Minimum dialog duration to transcribe in seconds (default: 5 )
15+ min-duration: Minimum dialog duration to transcribe in seconds (default: 0 )
1616 api-key: Optional API key for vfun server authentication
1717
1818Example configuration in config.yml:
1919 wtf_transcribe:
2020 module: links.wtf_transcribe
2121 options:
22- vfun-server-url: http://localhost:8443/transcribe
22+ vfun-server-url: http://localhost:4380/wtf
23+ language: en
2324 diarize: true
2425 timeout: 300
2526 min-duration: 5
2930import base64
3031import json
3132import logging
32- import os
33- import tempfile
3433import requests
35- from datetime import datetime , timezone
36- from typing import Optional , Dict , Any , List
34+ from typing import Optional , Dict , Any
3735
3836from server .lib .vcon_redis import VconRedis
3937from lib .logging_utils import init_logger
4442
4543default_options = {
4644 "vfun-server-url" : None ,
47- "diarize" : True ,
45+ "language" : None ,
46+ "diarize" : False ,
4847 "timeout" : 300 ,
49- "min-duration" : 5 ,
48+ "min-duration" : 0 ,
5049 "api-key" : None ,
5150}
5251
@@ -107,113 +106,23 @@ def get_audio_content(dialog: Dict[str, Any]) -> Optional[bytes]:
def create_wtf_analysis(
    dialog_index: int,
    vfun_response: Dict[str, Any],
    language: Optional[str] = None,
) -> Dict[str, Any]:
    """Wrap a vfun response in a vCon ``wtf_transcription`` analysis entry.

    The vfun response is used as the analysis body as-is. When ``language``
    is given and the response carries a ``transcript`` object, the language
    is written into that object, overriding whatever value it held.

    Args:
        dialog_index: Index of the dialog this analysis refers to.
        vfun_response: Decoded JSON body returned by the vfun server.
        language: Optional language override (e.g. ``"en"`` or ``"es"``).
            Falsy values leave the response untouched.

    Returns:
        A dict with the keys ``type``, ``dialog``, ``mediatype``, ``vendor``,
        ``schema`` and ``body``.
    """
    body = vfun_response
    transcript = vfun_response.get("transcript")

    # Only override when there is a transcript *object* to write into; a
    # missing, null or string transcript is passed through unchanged instead
    # of raising TypeError on item assignment.
    if language and isinstance(transcript, dict):
        # Copy the two levels that are modified so the caller's response
        # dict is not mutated as a side effect.
        body = dict(vfun_response)
        body["transcript"] = dict(transcript, language=language)

    return {
        "type": "wtf_transcription",
        "dialog": dialog_index,
        "mediatype": "application/json",
        "vendor": "vfun",
        "schema": "wtf-1.0",
        "body": body,
    }
218127
219128
@@ -247,7 +156,7 @@ def run(
247156 dialogs_skipped = 0
248157
249158 for i , dialog in enumerate (vcon .dialog ):
250- if not should_transcribe_dialog (dialog , opts .get ("min-duration" , 5 )):
159+ if not should_transcribe_dialog (dialog , opts .get ("min-duration" , 0 )):
251160 logger .debug (f"Skipping dialog { i } (not eligible)" )
252161 dialogs_skipped += 1
253162 continue
@@ -278,11 +187,13 @@ def run(
278187 mimetype = dialog .get ("mimetype" , "audio/wav" )
279188
280189 # Send audio to vfun server
281- files = {"file" : (filename , audio_content , mimetype )}
190+ files = {"file-binary " : (filename , audio_content , mimetype )}
282191 data = {
283- "diarize" : str (opts .get ("diarize" , True )),
284- "block" : "true" ,
192+ "diarize" : str (opts .get ("diarize" , True )).lower (),
285193 }
194+ language = opts .get ("language" )
195+ if language :
196+ data ["language" ] = language
286197
287198 response = requests .post (
288199 vfun_server_url ,
@@ -292,14 +203,13 @@ def run(
292203 timeout = opts .get ("timeout" , 300 ),
293204 )
294205
295- if response .status_code in ( 200 , 302 ) :
206+ if response .status_code == 200 :
296207 vfun_response = response .json ()
297208 # Handle double-encoded JSON (vfun sometimes returns JSON string)
298209 if isinstance (vfun_response , str ):
299210 vfun_response = json .loads (vfun_response )
300211
301- duration = dialog .get ("duration" , 30.0 )
302- wtf_analysis = create_wtf_analysis (i , vfun_response , float (duration ))
212+ wtf_analysis = create_wtf_analysis (i , vfun_response , language = opts .get ("language" ))
303213
304214 # Add analysis to vCon
305215 vcon .add_analysis (
@@ -309,7 +219,6 @@ def run(
309219 body = wtf_analysis ["body" ],
310220 extra = {
311221 "mediatype" : wtf_analysis .get ("mediatype" ),
312- "product" : wtf_analysis .get ("product" ),
313222 "schema" : wtf_analysis .get ("schema" ),
314223 },
315224 )
0 commit comments