Skip to content

Commit 9c2b752

Browse files
committed
Phase 7: покрытие тестами recognition 42% (общее 84%)
1 parent e8ebefe commit 9c2b752

File tree

11 files changed

+305
-32
lines changed

11 files changed

+305
-32
lines changed

config/default.yaml

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -128,9 +128,16 @@ recognition:
128128
model: "speechbrain/spkrec-ecapa-voxceleb" # RECOGNITION_MODEL
129129
device: "cpu" # RECOGNITION_DEVICE
130130
threshold: 0.7 # RECOGNITION_THRESHOLD
131-
embeddings_path: null # RECOGNITION_EMBEDDINGS_PATH
131+
embeddings_path: "./volume/models/embeddings" # RECOGNITION_EMBEDDINGS_PATH
132132
index_path: null # RECOGNITION_INDEX_PATH
133133

134+
voices:
135+
- name: alice_ivanova
136+
embedding: "./volume/models/embeddings/alice_ivanova.vec"
137+
phone: "+78001234567"
138+
description: "Голос Алисы Ивановой"
139+
140+
134141
# Этап CardDAV (связывание контактов)
135142
carddav:
136143
enabled: true # CARDDAV_ENABLED

src/app/annotation.py

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -167,35 +167,45 @@ def _build_final_annotation(
167167
# Обработка спикеров
168168
speakers_map = {}
169169
speaker_id = 0
170-
170+
171171
diarization_segments = diarization_result.payload.get("segments", []) if diarization_result else []
172172
recognition_speakers = recognition_result.payload.get("speakers", {}) if recognition_result else {}
173173
carddav_speakers = carddav_result.payload.get("speakers", {}) if carddav_result else {}
174-
174+
175+
# Подгрузка из config.voices для расширенного сопоставления
176+
known_voices = {v.name: v for v in getattr(self.config, "voices", [])}
177+
175178
for segment in diarization_segments:
176179
speaker_label = segment.get("speaker", "unknown")
177-
178180
if speaker_label not in speakers_map:
179181
speaker_id += 1
180182
speaker_info = FinalSpeaker(
181183
id=f"speaker_{speaker_id:02d}",
182184
label=speaker_label,
183185
segments_count=0,
184-
total_duration=0.0
186+
total_duration=0.0,
187+
voice_embedding=None,
188+
identified=False,
189+
confidence=0.0,
190+
name=None,
191+
contact_info=None,
185192
)
186-
193+
187194
# Добавление информации о распознавании
188195
recognition_info = recognition_speakers.get(speaker_label, {})
189196
if recognition_info:
190197
speaker_info.identified = recognition_info.get("identified", False)
191198
speaker_info.name = recognition_info.get("name")
192199
speaker_info.confidence = recognition_info.get("confidence", 0.0)
193-
200+
# Пробуем дополнить путь к эмбеддингу из известного голоса
201+
if speaker_info.name and speaker_info.name in known_voices:
202+
speaker_info.voice_embedding = known_voices[speaker_info.name].embedding
203+
194204
# Добавление информации из CardDAV
195205
carddav_info = carddav_speakers.get(speaker_label, {})
196206
if carddav_info and carddav_info.get("contact"):
197207
speaker_info.contact_info = carddav_info["contact"]
198-
208+
199209
speakers_map[speaker_label] = speaker_info
200210

201211
# Обработка сегментов и транскрипции

src/app/api/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
11
# src/app/api/__init__.py
22

33
from fastapi import APIRouter
4-
from .routers import health, jobs, ws
4+
from .routers import health, jobs, ws, voices
55

66
api_router = APIRouter()
77
api_router.include_router(health.router, prefix="/api/v1")
88
api_router.include_router(jobs.router, prefix="/api/v1/jobs", tags=["Jobs"])
9+
api_router.include_router(voices.router, prefix="/api/v1/voices", tags=["Voices"])
910
api_router.include_router(ws.router, prefix="/ws", tags=["WebSocket"])

src/app/api/routers/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
# src/app/api/routers/__init__.py
22

3-
from . import health, jobs, ws # noqa: F401
3+
from . import health, jobs, ws, voices # noqa: F401

src/app/api/routers/voices.py

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
# src/app/api/routers/voices.py
2+
"""
3+
REST API для управления известными голосами и эмбеддингами в CallAnnotate.
4+
5+
Автор: akoodoy@capilot.ru
6+
Ссылка: https://github.com/momentics/CallAnnotate
7+
Лицензия: Apache-2.0
8+
"""
9+
10+
from pathlib import Path
11+
from typing import List
12+
13+
from fastapi import (
14+
APIRouter,
15+
HTTPException,
16+
UploadFile,
17+
File,
18+
Form,
19+
status,
20+
Response,
21+
)
22+
23+
from ...schemas import VoiceInfo
24+
from ...config import load_settings
25+
from ...utils import ensure_directory
26+
27+
# Router for the voice-management endpoints defined in this module.
router = APIRouter()

# Settings are loaded once, at import time. The embeddings directory falls
# back to the default relative path when the configured value is empty, and
# is resolved to an absolute path.
CFG = load_settings()
EMBEDDINGS_DIR = Path(CFG.recognition.embeddings_path or "./volume/models/embeddings").resolve()
33+
34+
35+
def _embedding_file_path(name: str) -> Path:
    """Return the path of the ``<name>.vec`` file inside EMBEDDINGS_DIR."""
    filename = f"{name}.vec"
    return EMBEDDINGS_DIR / filename
37+
38+
39+
def _voice_exists(name: str) -> bool:
    """Tell whether an embedding file is already stored for ``name``."""
    candidate = _embedding_file_path(name)
    return candidate.exists()
41+
42+
43+
@router.get("/", response_model=List[VoiceInfo], tags=["Voices"])
async def list_voices():
    """List the known voices: one entry per ``*.vec`` file in EMBEDDINGS_DIR."""
    # NOTE(review): ensure_directory presumably creates the directory when it
    # is missing - confirm against utils.ensure_directory.
    ensure_directory(str(EMBEDDINGS_DIR))

    voices: List[VoiceInfo] = []
    for vec_path in EMBEDDINGS_DIR.glob("*.vec"):
        # The file stem doubles as the voice name.
        voices.append(VoiceInfo(name=vec_path.stem, embedding=str(vec_path)))
    return voices
52+
53+
54+
@router.post(
    "/", response_model=VoiceInfo, status_code=status.HTTP_201_CREATED, tags=["Voices"]
)
async def create_voice(
    name: str = Form(..., description="Имя голоса (уникальное, без пробелов)"),
    embedding_file: UploadFile = File(..., description="Файл эмбеддинга (.vec)"),
):
    """Register a new voice by storing its uploaded embedding file.

    Args:
        name: Voice name; must satisfy ``str.isidentifier()``.
        embedding_file: Uploaded embedding, saved as ``<name>.vec``.

    Returns:
        ``VoiceInfo`` with the voice name and the path of the stored file.

    Raises:
        HTTPException: 400 when the name is not a valid identifier or the
            upload is empty; 409 when a voice with this name already exists.
    """
    ensure_directory(str(EMBEDDINGS_DIR))

    if not name.isidentifier():
        raise HTTPException(
            status.HTTP_400_BAD_REQUEST,
            "Имя должно быть валидным идентификатором (без пробелов и спецсимволов)",
        )
    # Early check so a duplicate name is reported before the upload is read.
    if _voice_exists(name):
        raise HTTPException(
            status.HTTP_409_CONFLICT, f"Голос с именем '{name}' уже существует"
        )

    content = await embedding_file.read()
    if not content:
        raise HTTPException(status.HTTP_400_BAD_REQUEST, "Пустой файл эмбеддинга")

    path = _embedding_file_path(name)
    try:
        # Exclusive-create mode closes the window between the existence check
        # above and the write: a concurrent request that created the file in
        # the meantime gets a 409 instead of being silently overwritten.
        with open(path, "xb") as f:
            f.write(content)
    except FileExistsError as exc:
        raise HTTPException(
            status.HTTP_409_CONFLICT, f"Голос с именем '{name}' уже существует"
        ) from exc

    return VoiceInfo(name=name, embedding=str(path))
83+
84+
85+
@router.get("/{name}", response_model=VoiceInfo, tags=["Voices"])
async def get_voice(name: str):
    """Return the stored voice ``name``; answer 404 when it has no embedding file."""
    vec_path = _embedding_file_path(name)
    if vec_path.exists():
        return VoiceInfo(name=name, embedding=str(vec_path))
    raise HTTPException(status.HTTP_404_NOT_FOUND, "Голос не найден")
92+
93+
94+
@router.put("/{name}", response_model=VoiceInfo, tags=["Voices"])
async def update_voice(
    name: str,
    embedding_file: UploadFile = File(..., description="Новый файл эмбеддинга (.vec)"),
):
    """Overwrite the embedding file of an existing voice.

    Raises:
        HTTPException: 404 when the voice has no embedding file; 400 when the
            uploaded file is empty.
    """
    target = _embedding_file_path(name)
    if not target.exists():
        raise HTTPException(status.HTTP_404_NOT_FOUND, "Голос не найден")

    payload = await embedding_file.read()
    if not payload:
        raise HTTPException(status.HTTP_400_BAD_REQUEST, "Пустой файл эмбеддинга")

    # Replace the stored embedding in place.
    with open(target, "wb") as out:
        out.write(payload)

    return VoiceInfo(name=name, embedding=str(target))
112+
113+
114+
@router.delete("/{name}", status_code=status.HTTP_204_NO_CONTENT, tags=["Voices"])
async def delete_voice(name: str):
    """Delete a voice by removing its embedding file.

    Returns:
        An empty 204 No Content response.

    Raises:
        HTTPException: 404 when the voice has no embedding file (including
            when it disappears just before removal); 500 when the file
            cannot be removed for another OS-level reason.
    """
    path = _embedding_file_path(name)
    if not path.exists():
        raise HTTPException(status.HTTP_404_NOT_FOUND, "Голос не найден")

    try:
        path.unlink()
    except FileNotFoundError as exc:
        # The file vanished between the check above and the unlink (for
        # example a concurrent delete): that is "not found", not a server error.
        raise HTTPException(status.HTTP_404_NOT_FOUND, "Голос не найден") from exc
    except OSError as exc:
        raise HTTPException(
            status.HTTP_500_INTERNAL_SERVER_ERROR, f"Ошибка при удалении: {exc}"
        ) from exc

    # Return 204 No Content with no body.
    return Response(status_code=status.HTTP_204_NO_CONTENT)

src/app/config.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@ class TranscriptionConfig(BaseSettings):
7373
language: str = Field("ru", description="Язык транскрипции или auto")
7474
batch_size: int = Field(16, gt=0, description="Размер пакета")
7575
task: str = Field("transcribe", description="Задача: transcribe или translate")
76+
7677
metrics: MetricsConfig = Field(default_factory=MetricsConfig, description="Настройки сбора метрик")
7778

7879
class Config:
@@ -169,6 +170,11 @@ class VoiceInfo(BaseModel):
169170
phone: Optional[str] = Field(None, description="Номер телефона")
170171
description: Optional[str] = Field(None, description="Описание")
171172

173+
class VoiceInfoConfig(BaseModel):
    """Configuration entry describing one known voice (an item of ``voices``)."""

    name: str = Field(..., description="Уникальное имя голоса")
    embedding: str = Field(..., description="Путь к файлу эмбеддинга")
    phone: Optional[str] = Field(None, description="Номер телефона")
    description: Optional[str] = Field(None, description="Описание")
172178

173179
class WebhookConfig(BaseModel):
174180
"""Конфигурация веб-хуков"""
@@ -325,6 +331,8 @@ class AppSettings(BaseSettings):
325331

326332
preprocess: PreprocessingConfig = Field(default_factory=PreprocessingConfig)
327333

334+
voices: List[VoiceInfoConfig] = Field(default_factory=list, description="Известные голоса")
335+
328336

329337
@validator('recognition')
330338
def validate_recognition_paths(cls, v):

src/app/schemas.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -306,3 +306,19 @@ class ErrorDetail(BaseModel):
306306
class ErrorResponse(BaseModel):
    """Error response envelope carrying a single ``ErrorDetail``."""

    error: ErrorDetail
309+
310+
311+
312+
class VoiceInfoBase(BaseModel):
    """Fields shared by the voice schemas: the voice name and its embedding path."""

    # Unique name of the voice.
    name: str = Field(default=..., description="Уникальное имя голоса")
    # Path to the embedding file.
    embedding: str = Field(default=..., description="Путь к файлу эмбеддинга")
315+
316+
class VoiceInfoCreate(VoiceInfoBase):
    """Payload for creating a voice; adds no fields to ``VoiceInfoBase``."""
318+
319+
class VoiceInfoUpdate(BaseModel):
    """Payload for updating a voice: only the new embedding path."""

    # Path to the replacement embedding file.
    embedding: str = Field(default=..., description="Путь к новому файлу эмбеддинга")
321+
322+
class VoiceInfo(VoiceInfoBase):
    """Voice representation used by the API: name plus embedding path."""

    class Config:
        # pydantic v1 setting; presumably enabled so the model can be built
        # from attribute-style objects as well as dicts - confirm usage.
        orm_mode = True

src/app/stages/recognition.py

Lines changed: 21 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,12 @@ async def _initialize(self):
3636
model_name = self.config.get("model", "speechbrain/spkrec-ecapa-voxceleb")
3737
device = self.config.get("device", "cpu")
3838
embeddings_path = self.config.get("embeddings_path")
39-
39+
self.index = None
40+
self.speaker_labels = {}
41+
42+
if embeddings_path and Path(embeddings_path).exists():
43+
self._load_speaker_database(embeddings_path)
44+
4045
self.logger.info(f"Загрузка модели распознавания: {model_name}")
4146

4247
if self.models_registry:
@@ -68,51 +73,46 @@ async def _initialize(self):
6873
self.logger.info("Модель распознавания загружена успешно")
6974

7075
def _load_speaker_database(self, embeddings_path: str):
71-
"""Загрузка базы данных голосовых эмбеддингов"""
76+
"""Загрузка библиотеки голосов и построение индекса FAISS"""
7277
try:
7378
embeddings_dir = Path(embeddings_path)
74-
75-
# Поиск файлов эмбеддингов
7679
embedding_files = list(embeddings_dir.glob("*.vec")) + list(embeddings_dir.glob("*.pkl"))
77-
7880
if not embedding_files:
79-
self.logger.warning(f"Не найдены файлы эмбеддингов в {embeddings_path}")
81+
self.logger.warning(f"В каталоге эмбеддингов не найдено файлов: {embeddings_path}")
8082
return
81-
83+
8284
embeddings = []
8385
labels = []
84-
86+
8587
for emb_file in embedding_files:
8688
try:
8789
if emb_file.suffix == '.pkl':
8890
with open(emb_file, 'rb') as f:
8991
embedding = pickle.load(f)
90-
else: # .vec файл
92+
else: # .vec
9193
embedding = np.loadtxt(emb_file)
92-
93-
# Имя спикера из имени файла
94+
9495
speaker_name = emb_file.stem
95-
96+
9697
embeddings.append(embedding)
9798
labels.append(speaker_name)
9899
self.speaker_labels[len(labels) - 1] = speaker_name
99-
100+
100101
except Exception as e:
101-
self.logger.error(f"Ошибка загрузки эмбеддинга {emb_file}: {e}")
102-
103-
# Создание FAISS индекса
102+
self.logger.error(f"Ошибка чтения эмбеддинга {emb_file}: {e}")
103+
104104
if faiss and embeddings:
105105
embeddings_matrix = np.vstack(embeddings).astype('float32')
106106
dimension = embeddings_matrix.shape[1]
107-
108-
self.index = faiss.IndexFlatIP(dimension) # Cosine similarity
107+
108+
self.index = faiss.IndexFlatIP(dimension) # Косинусное сходство
109109
faiss.normalize_L2(embeddings_matrix)
110110
self.index.add(embeddings_matrix)
111-
111+
112112
self.logger.info(f"Загружено {len(embeddings)} эмбеддингов голосов")
113113
else:
114-
self.logger.warning("FAISS не доступен или нет эмбеддингов")
115-
114+
self.logger.warning("FAISS не доступен или эмбеддингов нет")
115+
116116
except Exception as e:
117117
self.logger.error(f"Ошибка загрузки базы голосов: {e}")
118118

src/app/utils.py

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,14 @@
1010
import shutil
1111
import logging.config
1212
from pathlib import Path
13-
from typing import Dict, Any, Optional, Union
13+
from typing import Dict, Any, Optional, Union, List
1414
from fastapi import UploadFile
1515
from datetime import datetime, timedelta
1616

1717
import librosa
1818

1919
from .schemas import AudioMetadata
20+
from .schemas import VoiceInfo
2021

2122

2223
class ValidationResult:
@@ -271,3 +272,21 @@ def create_task_metadata(
271272
meta["websocket_client_id"] = websocket_client_id
272273
return meta
273274

275+
def load_known_voices_from_embeddings(embeddings_dir: str) -> List[VoiceInfo]:
    """Build the list of known voices from a directory of embedding files.

    Args:
        embeddings_dir: Path to the directory holding ``*.vec`` files; a
            leading ``~`` is expanded and the path is resolved.

    Returns:
        One ``VoiceInfo`` per ``*.vec`` file (name is the file stem, embedding
        is the file path); an empty list when the path does not exist or is
        not a directory.
    """
    root = Path(embeddings_dir).expanduser().resolve()
    if not (root.exists() and root.is_dir()):
        return []
    return [VoiceInfo(name=vec.stem, embedding=str(vec)) for vec in root.glob("*.vec")]

0 commit comments

Comments
 (0)