Skip to content

Commit 9f061a9

Browse files
authored
fix: Windows CUDA detection and speaker/confidence bugs (#9)
* feat: optimize CUDA/cuDNN stack and dependency management - Upgrade to CUDA 12.8 + cuDNN 9 (7.6% faster vs 12.6) - Remove cuDNN 8 detection (ctranslate2 >= 4.5.0 requires cuDNN 9) - Remove redundant torch/torchaudio/torchvision deps (inherit from core) - Tighten dependency floors to tested versions - Fix Dockerfile to use uv.lock for reproducible builds Dependency floor changes: - fastapi: >=0.100 → >=0.110 - uvicorn: >=0.20 → >=0.25 - pydantic: >=2.0 → >=2.5 - pydantic-settings: >=2.0 → >=2.3 - httpx: >=0.25 → >=0.27 - aiosqlite: >=0.19 → >=0.20 Council reviewed: 4 perspectives (questioner, simplifier, operator, ergonomist) * perf: improve Docker layer caching for faster builds Split dependency installation from source copy: 1. Copy manifests first (pyproject.toml, uv.lock) 2. Install deps with --no-install-project (cached layer) 3. Copy source code 4. Install project with --no-deps (fast) This ensures source code changes don't invalidate the expensive ~2GB dependency installation layer. * feat: add MLX roadmap + fix macOS torch compatibility - Add Native Apple Silicon (MLX) as 100-star milestone - Gate cu128 torch source to Linux/Windows only - macOS falls back to PyPI CPU wheels This fixes uv-based installation on macOS which was broken by the cu128 index (no Darwin wheels available). * docs: adjust MLX milestone to 750 stars (building blocks exist) * chore: bump murmurai-core to 1.0.2 (CUDA 12.8, macOS support) * fix: Windows CUDA detection and speaker/confidence bugs - deps.py: Detect CPU-only PyTorch and recommend --torch-backend=auto for Windows - README.md: Add Windows install section with uv pip --torch-backend=auto - pyproject.toml: Remove unnecessary torch uv.sources (API doesn't depend on torch), update murmurai-core>=1.0.4 - transcriber.py: Fix phantom speaker label and hardcoded 0.85 confidence when diarization/word_timestamps disabled * fix: use --frozen in pre-commit hooks to prevent file modifications
1 parent 07b2abb commit 9f061a9

5 files changed

Lines changed: 100 additions & 55 deletions

File tree

.pre-commit-config.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,14 +20,14 @@ repos:
2020
hooks:
2121
- id: mypy
2222
name: mypy
23-
entry: uv run mypy src/
23+
entry: uv run --frozen mypy src/
2424
language: system
2525
types: [python]
2626
pass_filenames: false
2727

2828
- id: pytest
2929
name: pytest
30-
entry: uv run pytest tests/ -x -q
30+
entry: uv run --frozen pytest tests/ -x -q
3131
language: system
3232
types: [python]
3333
pass_filenames: false

README.md

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,21 @@ docker compose up
105105

106106
Requires NVIDIA Container Toolkit. Set `MURMURAI_API_KEY` in the environment for production.
107107

108+
### Windows Install
109+
110+
Windows requires PyTorch with CUDA wheels from PyTorch's own index, because PyPI ships only CPU wheels for Windows.
111+
112+
```powershell
113+
# One command (auto-detects CUDA):
114+
uv pip install murmurai --torch-backend=auto
115+
116+
# Or manually:
117+
uv pip install torch torchaudio --index-url https://download.pytorch.org/whl/cu128
118+
uv pip install murmurai
119+
```
120+
121+
If you see "PyTorch is CPU-only", reinstall with `--torch-backend=auto` or use the manual method above.
122+
108123
The API starts at `http://localhost:8880`. Swagger docs at `/docs`.
109124

110125
### First Transcription

pyproject.toml

Lines changed: 2 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
44

55
[project]
66
name = "murmurai"
7-
version = "1.0.4-rc.1"
7+
version = "1.0.3"
88
description = "GPU-powered transcription API with speaker diarization"
99
readme = "README.md"
1010
requires-python = "==3.12.*"
@@ -25,7 +25,7 @@ classifiers = [
2525
]
2626

2727
dependencies = [
28-
"murmurai-core>=1.0.2",
28+
"murmurai-core>=1.0.5",
2929
"fastapi>=0.110",
3030
"uvicorn>=0.25",
3131
"pydantic>=2.5",
@@ -52,17 +52,6 @@ packages = ["src/murmurai_server"]
5252

5353
[tool.uv]
5454
managed = true
55-
index-strategy = "unsafe-best-match"
56-
57-
[tool.uv.sources]
58-
# Use CUDA 12.8 wheels on Linux/Windows, PyPI CPU wheels on macOS
59-
torch = {index = "pytorch", marker = "sys_platform != 'darwin'"}
60-
torchaudio = {index = "pytorch", marker = "sys_platform != 'darwin'"}
61-
62-
[[tool.uv.index]]
63-
name = "pytorch"
64-
url = "https://download.pytorch.org/whl/cu128"
65-
priority = "supplemental"
6655

6756
[tool.ruff]
6857
target-version = "py312"

src/murmurai_server/deps.py

Lines changed: 43 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,25 @@ class DependencyStatus:
3838
"Windows": """\
3939
CUDA toolkit required. Install from:
4040
https://developer.nvidia.com/cuda-downloads""",
41+
},
42+
"cuda_pytorch": {
43+
"Linux": """\
44+
PyTorch was installed without CUDA support. Reinstall with CUDA:
45+
uv pip uninstall torch torchaudio -y
46+
uv pip install torch torchaudio --index-url https://download.pytorch.org/whl/cu128""",
47+
"Darwin": """\
48+
PyTorch on macOS does not support CUDA. MurmurAI requires NVIDIA GPU.
49+
Consider using a Linux machine or cloud GPU instance (AWS, GCP, Lambda Labs).""",
50+
"Windows": """\
51+
PyTorch was installed without CUDA support (PyPI only has CPU wheels for Windows).
52+
53+
Reinstall with CUDA auto-detection:
54+
uv pip install murmurai --torch-backend=auto
55+
56+
Or manually:
57+
uv pip uninstall torch torchaudio
58+
uv pip install torch torchaudio --index-url https://download.pytorch.org/whl/cu128
59+
uv pip install murmurai""",
4160
},
4261
"cudnn": {
4362
"Linux": """\
@@ -91,12 +110,21 @@ def check_cuda() -> DependencyStatus:
91110
details=f"{device_count} GPU(s) available",
92111
)
93112
else:
94-
return DependencyStatus(
95-
name="CUDA",
96-
available=False,
97-
error="PyTorch cannot detect CUDA",
98-
install_hint=get_install_hint("cuda"),
99-
)
113+
# Check if PyTorch is CPU-only (no CUDA compiled in)
114+
if torch.version.cuda is None:
115+
return DependencyStatus(
116+
name="CUDA",
117+
available=False,
118+
error="PyTorch is CPU-only (no CUDA support compiled in)",
119+
install_hint=get_install_hint("cuda_pytorch"),
120+
)
121+
else:
122+
return DependencyStatus(
123+
name="CUDA",
124+
available=False,
125+
error="PyTorch cannot detect CUDA",
126+
install_hint=get_install_hint("cuda"),
127+
)
100128
except ImportError:
101129
return DependencyStatus(
102130
name="CUDA",
@@ -365,10 +393,15 @@ def print_dependency_report(statuses: list[DependencyStatus]) -> bool:
365393
else:
366394
print(" [ERROR] Required dependencies missing!")
367395
print()
368-
print(" Run the install script to fix:")
369-
print(
370-
" curl -fsSL https://raw.githubusercontent.com/namastexlabs/murmurai/main/get-murmurai.sh | bash"
371-
)
396+
system = get_platform()
397+
if system == "Windows":
398+
print(" See the hints above to fix dependencies.")
399+
print()
400+
print(" For full instructions, visit:")
401+
print(" https://github.com/namastexlabs/murmurai#windows-install")
402+
else:
403+
print(" Run the install script to fix:")
404+
print(" curl -fsSL https://install.namastex.ai/get-murmurai.sh | bash")
372405
print()
373406
print(" Or continue anyway with: murmurai --force")
374407

src/murmurai_server/transcriber.py

Lines changed: 38 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -425,7 +425,13 @@ def transcribe(
425425
if progress_callback:
426426
progress_callback(0.95) # Diarization done
427427

428-
formatted = format_result(result, detected_language, speaker_embeddings)
428+
formatted = format_result(
429+
result,
430+
detected_language,
431+
speaker_embeddings,
432+
speaker_labels=options.speaker_labels,
433+
word_timestamps=options.word_timestamps,
434+
)
429435

430436
if progress_callback:
431437
progress_callback(1.0) # Complete
@@ -449,13 +455,17 @@ def format_result(
449455
result: dict[str, Any],
450456
language: str,
451457
speaker_embeddings: dict[str, list[float]] | None = None,
458+
speaker_labels: bool = False,
459+
word_timestamps: bool = False,
452460
) -> dict[str, Any]:
453461
"""Format result to API response format.
454462
455463
Args:
456464
result: Raw result with segments.
457465
language: Detected/specified language code.
458466
speaker_embeddings: Optional speaker embedding vectors.
467+
speaker_labels: Whether speaker diarization was requested.
468+
word_timestamps: Whether word-level timestamps were requested.
459469
460470
Returns:
461471
Formatted transcript with words and utterances.
@@ -464,51 +474,46 @@ def format_result(
464474
utterances: list[dict[str, Any]] = []
465475

466476
for segment in result.get("segments", []):
467-
speaker = segment.get("speaker", "A")
477+
# Only include speaker if diarization was requested
478+
speaker = segment.get("speaker") if speaker_labels else None
468479
utterance_words: list[dict[str, Any]] = []
469480

470481
for word in segment.get("words", []):
471-
word_data = {
482+
word_data: dict[str, Any] = {
472483
"text": word.get("word", ""),
473484
"start": int(word.get("start", 0) * 1000), # Convert to ms
474485
"end": int(word.get("end", 0) * 1000),
475486
"confidence": word.get("score", 0.0),
476-
"speaker": speaker,
477487
}
488+
# Only include speaker if diarization was requested and speaker exists
489+
if speaker:
490+
word_data["speaker"] = speaker
478491
words.append(word_data)
479492
utterance_words.append(word_data)
480493

481494
# Build utterance from segment
495+
utterance: dict[str, Any] = {
496+
"text": segment.get("text", "").strip(),
497+
"start": int(segment.get("start", 0) * 1000),
498+
"end": int(segment.get("end", 0) * 1000),
499+
"words": utterance_words,
500+
}
501+
502+
# Only include speaker if diarization was requested and speaker exists
503+
if speaker:
504+
utterance["speaker"] = speaker
505+
506+
# Only include confidence if we have word-level data
482507
if utterance_words:
483-
avg_confidence = sum(w["confidence"] for w in utterance_words) / len(utterance_words)
484-
else:
485-
# Without word-level alignment, use 0.85 baseline (Whisper is generally accurate)
486-
avg_confidence = 0.85
508+
utterance["confidence"] = sum(w["confidence"] for w in utterance_words) / len(
509+
utterance_words
510+
)
487511

488-
utterances.append(
489-
{
490-
"speaker": speaker,
491-
"text": segment.get("text", "").strip(),
492-
"start": int(segment.get("start", 0) * 1000),
493-
"end": int(segment.get("end", 0) * 1000),
494-
"confidence": avg_confidence,
495-
"words": utterance_words,
496-
}
497-
)
512+
utterances.append(utterance)
498513

499514
# Calculate overall metrics
500515
full_text = " ".join(s.get("text", "").strip() for s in result.get("segments", []))
501516

502-
# Confidence: use word-level if available, otherwise estimate from utterance count
503-
if words:
504-
total_confidence = sum(w["confidence"] for w in words) / len(words)
505-
elif utterances:
506-
# Without word-level alignment, use 0.85 as baseline (Whisper is generally accurate)
507-
# This indicates "transcription worked but no word-level confidence available"
508-
total_confidence = 0.85
509-
else:
510-
total_confidence = 0.0
511-
512517
# Audio duration: use word-level if available, otherwise use utterance end times
513518
if words:
514519
audio_duration = max((w["end"] for w in words), default=0)
@@ -517,15 +522,18 @@ def format_result(
517522
else:
518523
audio_duration = 0
519524

520-
formatted = {
525+
formatted: dict[str, Any] = {
521526
"text": full_text,
522527
"words": words,
523528
"utterances": utterances,
524-
"confidence": total_confidence,
525529
"audio_duration": audio_duration,
526530
"language_code": language,
527531
}
528532

533+
# Only include confidence if we have word-level data
534+
if words:
535+
formatted["confidence"] = sum(w["confidence"] for w in words) / len(words)
536+
529537
# Include speaker embeddings if available
530538
if speaker_embeddings:
531539
formatted["speaker_embeddings"] = speaker_embeddings

0 commit comments

Comments
 (0)