Skip to content

Commit 3c82e44

Browse files
pavanputhra and claude committed
Update wtf_transcribe to align with new vfun /wtf API
- Adds language override option ("en"/"es") for Spanish support
- Simplifies create_wtf_analysis to pass vfun WTF-compliant response directly
- Renames multipart field from "file" to "file-binary" per new API contract
- Fixes diarize boolean to send lowercase string ("true"/"false")
- Removes block:true param no longer needed by new endpoint
- Changes diarize default to False (opt-in), min-duration default to 0
- Drops unused imports (os, tempfile, datetime, List)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent cbf1734 commit 3c82e44

1 file changed

Lines changed: 26 additions & 117 deletions

File tree

server/links/wtf_transcribe/__init__.py

Lines changed: 26 additions & 117 deletions
Original file line number | Diff line number | Diff line change
@@ -4,22 +4,23 @@
44
the results as WTF (World Transcription Format) analysis entries.
55
66
The vfun server provides:
7-
- Multi-language speech recognition (English + auto-detect)
8-
- Speaker diarization (who spoke when)
7+
- Multi-language speech recognition (English + Spanish, auto-detect)
98
- GPU-accelerated processing with CUDA
109
1110
Configuration options:
1211
vfun-server-url: URL of the vfun transcription server (required)
13-
diarize: Enable speaker diarization (default: true)
12+
language: Language override ("en" or "es"). If omitted, vfun auto-detects.
13+
diarize: Enable speaker diarization (default: False)
1414
timeout: Request timeout in seconds (default: 300)
15-
min-duration: Minimum dialog duration to transcribe in seconds (default: 5)
15+
min-duration: Minimum dialog duration to transcribe in seconds (default: 0)
1616
api-key: Optional API key for vfun server authentication
1717
1818
Example configuration in config.yml:
1919
wtf_transcribe:
2020
module: links.wtf_transcribe
2121
options:
22-
vfun-server-url: http://localhost:8443/transcribe
22+
vfun-server-url: http://localhost:4380/wtf
23+
language: en
2324
diarize: true
2425
timeout: 300
2526
min-duration: 5
@@ -29,11 +30,8 @@
2930
import base64
3031
import json
3132
import logging
32-
import os
33-
import tempfile
3433
import requests
35-
from datetime import datetime, timezone
36-
from typing import Optional, Dict, Any, List
34+
from typing import Optional, Dict, Any
3735

3836
from server.lib.vcon_redis import VconRedis
3937
from lib.logging_utils import init_logger
@@ -44,9 +42,10 @@
4442

4543
default_options = {
4644
"vfun-server-url": None,
47-
"diarize": True,
45+
"language": None,
46+
"diarize": False,
4847
"timeout": 300,
49-
"min-duration": 5,
48+
"min-duration": 0,
5049
"api-key": None,
5150
}
5251

@@ -107,113 +106,23 @@ def get_audio_content(dialog: Dict[str, Any]) -> Optional[bytes]:
107106
def create_wtf_analysis(
108107
dialog_index: int,
109108
vfun_response: Dict[str, Any],
110-
duration: float,
109+
language: Optional[str] = None,
111110
) -> Dict[str, Any]:
112-
"""Create a WTF analysis entry from vfun response."""
113-
now = datetime.now(timezone.utc).isoformat()
114-
115-
# Extract text and segments from vfun response
116-
# vfun returns: analysis[].body with transcription data
117-
analysis_entries = vfun_response.get("analysis", [])
118-
119-
full_text = ""
120-
segments = []
121-
language = "en-US"
122-
123-
for entry in analysis_entries:
124-
if entry.get("type") in ("transcription", "wtf_transcription"):
125-
body = entry.get("body", {})
126-
127-
# Handle different response formats
128-
if isinstance(body, dict):
129-
# WTF format from vfun
130-
transcript = body.get("transcript", {})
131-
full_text = transcript.get("text", body.get("text", ""))
132-
language = transcript.get("language", body.get("language", "en-US"))
133-
segments = body.get("segments", [])
134-
elif isinstance(body, str):
135-
full_text = body
136-
break
137-
138-
# If no analysis found, check for direct text field
139-
if not full_text:
140-
full_text = vfun_response.get("text", "")
141-
segments = vfun_response.get("segments", [])
142-
143-
# Calculate confidence
144-
if segments:
145-
confidences = [s.get("confidence", 0.9) for s in segments]
146-
avg_confidence = sum(confidences) / len(confidences)
147-
else:
148-
avg_confidence = 0.9
149-
150-
# Build WTF segments
151-
wtf_segments = []
152-
for i, seg in enumerate(segments):
153-
wtf_seg = {
154-
"id": seg.get("id", i),
155-
"start": float(seg.get("start", seg.get("start_time", 0.0))),
156-
"end": float(seg.get("end", seg.get("end_time", 0.0))),
157-
"text": seg.get("text", seg.get("transcription", "")),
158-
"confidence": float(seg.get("confidence", 0.9)),
159-
}
160-
if "speaker" in seg:
161-
wtf_seg["speaker"] = seg["speaker"]
162-
wtf_segments.append(wtf_seg)
163-
164-
# Build speakers section
165-
speakers = {}
166-
for seg in wtf_segments:
167-
speaker = seg.get("speaker")
168-
if speaker is not None:
169-
speaker_key = str(speaker)
170-
if speaker_key not in speakers:
171-
speakers[speaker_key] = {
172-
"id": speaker,
173-
"label": f"Speaker {speaker}",
174-
"segments": [],
175-
"total_time": 0.0,
176-
}
177-
speakers[speaker_key]["segments"].append(seg["id"])
178-
speakers[speaker_key]["total_time"] += seg["end"] - seg["start"]
179-
180-
# Build WTF body
181-
wtf_body = {
182-
"transcript": {
183-
"text": full_text,
184-
"language": language,
185-
"duration": float(duration),
186-
"confidence": float(avg_confidence),
187-
},
188-
"segments": wtf_segments,
189-
"metadata": {
190-
"created_at": now,
191-
"processed_at": now,
192-
"provider": "vfun",
193-
"model": "parakeet-tdt-110m",
194-
"audio": {
195-
"duration": float(duration),
196-
},
197-
},
198-
"quality": {
199-
"average_confidence": float(avg_confidence),
200-
"multiple_speakers": len(speakers) > 1,
201-
"low_confidence_words": sum(1 for s in wtf_segments if s.get("confidence", 1.0) < 0.5),
202-
},
203-
}
111+
"""Create a WTF analysis entry from vfun response.
204112
205-
if speakers:
206-
wtf_body["speakers"] = speakers
113+
vfun returns a WTF-compliant body directly. If language is set in
114+
config, it is added to the transcript object.
115+
"""
116+
if language and "transcript" in vfun_response:
117+
vfun_response["transcript"]["language"] = language
207118

208119
return {
209120
"type": "wtf_transcription",
210121
"dialog": dialog_index,
211122
"mediatype": "application/json",
212123
"vendor": "vfun",
213-
"product": "parakeet-tdt-110m",
214124
"schema": "wtf-1.0",
215-
# Note: encoding omitted since body is a direct object, not a JSON string
216-
"body": wtf_body,
125+
"body": vfun_response,
217126
}
218127

219128

@@ -247,7 +156,7 @@ def run(
247156
dialogs_skipped = 0
248157

249158
for i, dialog in enumerate(vcon.dialog):
250-
if not should_transcribe_dialog(dialog, opts.get("min-duration", 5)):
159+
if not should_transcribe_dialog(dialog, opts.get("min-duration", 0)):
251160
logger.debug(f"Skipping dialog {i} (not eligible)")
252161
dialogs_skipped += 1
253162
continue
@@ -278,11 +187,13 @@ def run(
278187
mimetype = dialog.get("mimetype", "audio/wav")
279188

280189
# Send audio to vfun server
281-
files = {"file": (filename, audio_content, mimetype)}
190+
files = {"file-binary": (filename, audio_content, mimetype)}
282191
data = {
283-
"diarize": str(opts.get("diarize", True)),
284-
"block": "true",
192+
"diarize": str(opts.get("diarize", True)).lower(),
285193
}
194+
language = opts.get("language")
195+
if language:
196+
data["language"] = language
286197

287198
response = requests.post(
288199
vfun_server_url,
@@ -292,14 +203,13 @@ def run(
292203
timeout=opts.get("timeout", 300),
293204
)
294205

295-
if response.status_code in (200, 302):
206+
if response.status_code == 200:
296207
vfun_response = response.json()
297208
# Handle double-encoded JSON (vfun sometimes returns JSON string)
298209
if isinstance(vfun_response, str):
299210
vfun_response = json.loads(vfun_response)
300211

301-
duration = dialog.get("duration", 30.0)
302-
wtf_analysis = create_wtf_analysis(i, vfun_response, float(duration))
212+
wtf_analysis = create_wtf_analysis(i, vfun_response, language=opts.get("language"))
303213

304214
# Add analysis to vCon
305215
vcon.add_analysis(
@@ -309,7 +219,6 @@ def run(
309219
body=wtf_analysis["body"],
310220
extra={
311221
"mediatype": wtf_analysis.get("mediatype"),
312-
"product": wtf_analysis.get("product"),
313222
"schema": wtf_analysis.get("schema"),
314223
},
315224
)

0 commit comments

Comments
 (0)