fix: Windows CUDA detection and speaker/confidence bugs (#9)

namastex888 · web-flow · commit 9f061a9d6381 · 2025-12-17T00:37:27.000-03:00
* feat: optimize CUDA/cuDNN stack and dependency management

- Upgrade to CUDA 12.8 + cuDNN 9 (7.6% faster vs 12.6)
- Remove cuDNN 8 detection (ctranslate2 >= 4.5.0 requires cuDNN 9)
- Remove redundant torch/torchaudio/torchvision deps (inherit from core)
- Tighten dependency floors to tested versions
- Fix Dockerfile to use uv.lock for reproducible builds

Dependency floor changes:
- fastapi: >=0.100 → >=0.110
- uvicorn: >=0.20 → >=0.25
- pydantic: >=2.0 → >=2.5
- pydantic-settings: >=2.0 → >=2.3
- httpx: >=0.25 → >=0.27
- aiosqlite: >=0.19 → >=0.20

Council reviewed: 4 perspectives (questioner, simplifier, operator, ergonomist)

* perf: improve Docker layer caching for faster builds

Split dependency installation from source copy:
1. Copy manifests first (pyproject.toml, uv.lock)
2. Install deps with --no-install-project (cached layer)
3. Copy source code
4. Install project with --no-deps (fast)

This ensures source code changes don't invalidate the expensive
~2GB dependency installation layer.

* feat: add MLX roadmap + fix macOS torch compatibility

- Add Native Apple Silicon (MLX) as 100-star milestone
- Gate cu128 torch source to Linux/Windows only
- macOS falls back to PyPI CPU wheels

This fixes uv-based installation on macOS which was broken
by the cu128 index (no Darwin wheels available).

* docs: adjust MLX milestone to 750 stars (building blocks exist)

* chore: bump murmurai-core to 1.0.2 (CUDA 12.8, macOS support)

* fix: Windows CUDA detection and speaker/confidence bugs

- deps.py: Detect CPU-only PyTorch and recommend --torch-backend=auto for Windows
- README.md: Add Windows install section with uv pip --torch-backend=auto
- pyproject.toml: Remove unnecessary torch uv.sources (API doesn't depend on torch),
  update murmurai-core>=1.0.4
- transcriber.py: Fix phantom speaker label and hardcoded 0.85 confidence when
  diarization/word_timestamps disabled

* fix: use --frozen in pre-commit hooks to prevent file modifications
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -20,14 +20,14 @@ repos:
     hooks:
       - id: mypy
         name: mypy
-        entry: uv run mypy src/
+        entry: uv run --frozen mypy src/
         language: system
         types: [python]
         pass_filenames: false
 
       - id: pytest
         name: pytest
-        entry: uv run pytest tests/ -x -q
+        entry: uv run --frozen pytest tests/ -x -q
         language: system
         types: [python]
         pass_filenames: false
diff --git a/README.md b/README.md
@@ -105,6 +105,21 @@ docker compose up
 
 Requires NVIDIA Container Toolkit. Set `MURMURAI_API_KEY` in environment for production.
 
+### Windows Install
+
+Windows requires PyTorch with CUDA from PyTorch's index (PyPI only has CPU wheels for Windows).
+
+```powershell
+# One command (auto-detects CUDA):
+uv pip install murmurai --torch-backend=auto
+
+# Or manually:
+uv pip install torch torchaudio --index-url https://download.pytorch.org/whl/cu128
+uv pip install murmurai
+```
+
+If you see "PyTorch is CPU-only", reinstall with `--torch-backend=auto` or use the manual method above.
+
 The API starts at `http://localhost:8880`. Swagger docs at `/docs`.
 
 ### First Transcription
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "murmurai"
-version = "1.0.4-rc.1"
+version = "1.0.3"
 description = "GPU-powered transcription API with speaker diarization"
 readme = "README.md"
 requires-python = "==3.12.*"
@@ -25,7 +25,7 @@ classifiers = [
 ]
 
 dependencies = [
-    "murmurai-core>=1.0.2",
+    "murmurai-core>=1.0.5",
     "fastapi>=0.110",
     "uvicorn>=0.25",
     "pydantic>=2.5",
@@ -52,17 +52,6 @@ packages = ["src/murmurai_server"]
 
 [tool.uv]
 managed = true
-index-strategy = "unsafe-best-match"
-
-[tool.uv.sources]
-# Use CUDA 12.8 wheels on Linux/Windows, PyPI CPU wheels on macOS
-torch = {index = "pytorch", marker = "sys_platform != 'darwin'"}
-torchaudio = {index = "pytorch", marker = "sys_platform != 'darwin'"}
-
-[[tool.uv.index]]
-name = "pytorch"
-url = "https://download.pytorch.org/whl/cu128"
-priority = "supplemental"
 
 [tool.ruff]
 target-version = "py312"
diff --git a/src/murmurai_server/deps.py b/src/murmurai_server/deps.py
@@ -38,6 +38,25 @@ class DependencyStatus:
         "Windows": """\
 CUDA toolkit required. Install from:
   https://developer.nvidia.com/cuda-downloads""",
+    },
+    "cuda_pytorch": {
+        "Linux": """\
+PyTorch was installed without CUDA support. Reinstall with CUDA:
+  uv pip uninstall torch torchaudio -y
+  uv pip install torch torchaudio --index-url https://download.pytorch.org/whl/cu128""",
+        "Darwin": """\
+PyTorch on macOS does not support CUDA. MurmurAI requires NVIDIA GPU.
+Consider using a Linux machine or cloud GPU instance (AWS, GCP, Lambda Labs).""",
+        "Windows": """\
+PyTorch was installed without CUDA support (PyPI only has CPU wheels for Windows).
+
+Reinstall with CUDA auto-detection:
+  uv pip install murmurai --torch-backend=auto
+
+Or manually:
+  uv pip uninstall torch torchaudio
+  uv pip install torch torchaudio --index-url https://download.pytorch.org/whl/cu128
+  uv pip install murmurai""",
     },
     "cudnn": {
         "Linux": """\
@@ -91,12 +110,21 @@ def check_cuda() -> DependencyStatus:
                 details=f"{device_count} GPU(s) available",
             )
         else:
-            return DependencyStatus(
-                name="CUDA",
-                available=False,
-                error="PyTorch cannot detect CUDA",
-                install_hint=get_install_hint("cuda"),
-            )
+            # Check if PyTorch is CPU-only (no CUDA compiled in)
+            if torch.version.cuda is None:
+                return DependencyStatus(
+                    name="CUDA",
+                    available=False,
+                    error="PyTorch is CPU-only (no CUDA support compiled in)",
+                    install_hint=get_install_hint("cuda_pytorch"),
+                )
+            else:
+                return DependencyStatus(
+                    name="CUDA",
+                    available=False,
+                    error="PyTorch cannot detect CUDA",
+                    install_hint=get_install_hint("cuda"),
+                )
     except ImportError:
         return DependencyStatus(
             name="CUDA",
@@ -365,10 +393,15 @@ def print_dependency_report(statuses: list[DependencyStatus]) -> bool:
     else:
         print("  [ERROR] Required dependencies missing!")
         print()
-        print("  Run the install script to fix:")
-        print(
-            "    curl -fsSL https://raw.githubusercontent.com/namastexlabs/murmurai/main/get-murmurai.sh | bash"
-        )
+        system = get_platform()
+        if system == "Windows":
+            print("  See the hints above to fix dependencies.")
+            print()
+            print("  For full instructions, visit:")
+            print("    https://github.com/namastexlabs/murmurai#windows-install")
+        else:
+            print("  Run the install script to fix:")
+            print("    curl -fsSL https://install.namastex.ai/get-murmurai.sh | bash")
         print()
         print("  Or continue anyway with: murmurai --force")
 
diff --git a/src/murmurai_server/transcriber.py b/src/murmurai_server/transcriber.py
@@ -425,7 +425,13 @@ def transcribe(
     if progress_callback:
         progress_callback(0.95)  # Diarization done
 
-    formatted = format_result(result, detected_language, speaker_embeddings)
+    formatted = format_result(
+        result,
+        detected_language,
+        speaker_embeddings,
+        speaker_labels=options.speaker_labels,
+        word_timestamps=options.word_timestamps,
+    )
 
     if progress_callback:
         progress_callback(1.0)  # Complete
@@ -449,13 +455,17 @@ def format_result(
     result: dict[str, Any],
     language: str,
     speaker_embeddings: dict[str, list[float]] | None = None,
+    speaker_labels: bool = False,
+    word_timestamps: bool = False,
 ) -> dict[str, Any]:
     """Format result to API response format.
 
     Args:
         result: Raw result with segments.
         language: Detected/specified language code.
         speaker_embeddings: Optional speaker embedding vectors.
+        speaker_labels: Whether speaker diarization was requested.
+        word_timestamps: Whether word-level timestamps were requested.
 
     Returns:
         Formatted transcript with words and utterances.
@@ -464,51 +474,46 @@ def format_result(
     utterances: list[dict[str, Any]] = []
 
     for segment in result.get("segments", []):
-        speaker = segment.get("speaker", "A")
+        # Only include speaker if diarization was requested
+        speaker = segment.get("speaker") if speaker_labels else None
         utterance_words: list[dict[str, Any]] = []
 
         for word in segment.get("words", []):
-            word_data = {
+            word_data: dict[str, Any] = {
                 "text": word.get("word", ""),
                 "start": int(word.get("start", 0) * 1000),  # Convert to ms
                 "end": int(word.get("end", 0) * 1000),
                 "confidence": word.get("score", 0.0),
-                "speaker": speaker,
             }
+            # Only include speaker if diarization was requested and speaker exists
+            if speaker:
+                word_data["speaker"] = speaker
             words.append(word_data)
             utterance_words.append(word_data)
 
         # Build utterance from segment
+        utterance: dict[str, Any] = {
+            "text": segment.get("text", "").strip(),
+            "start": int(segment.get("start", 0) * 1000),
+            "end": int(segment.get("end", 0) * 1000),
+            "words": utterance_words,
+        }
+
+        # Only include speaker if diarization was requested and speaker exists
+        if speaker:
+            utterance["speaker"] = speaker
+
+        # Only include confidence if we have word-level data
         if utterance_words:
-            avg_confidence = sum(w["confidence"] for w in utterance_words) / len(utterance_words)
-        else:
-            # Without word-level alignment, use 0.85 baseline (Whisper is generally accurate)
-            avg_confidence = 0.85
+            utterance["confidence"] = sum(w["confidence"] for w in utterance_words) / len(
+                utterance_words
+            )
 
-        utterances.append(
-            {
-                "speaker": speaker,
-                "text": segment.get("text", "").strip(),
-                "start": int(segment.get("start", 0) * 1000),
-                "end": int(segment.get("end", 0) * 1000),
-                "confidence": avg_confidence,
-                "words": utterance_words,
-            }
-        )
+        utterances.append(utterance)
 
     # Calculate overall metrics
     full_text = " ".join(s.get("text", "").strip() for s in result.get("segments", []))
 
-    # Confidence: use word-level if available, otherwise estimate from utterance count
-    if words:
-        total_confidence = sum(w["confidence"] for w in words) / len(words)
-    elif utterances:
-        # Without word-level alignment, use 0.85 as baseline (Whisper is generally accurate)
-        # This indicates "transcription worked but no word-level confidence available"
-        total_confidence = 0.85
-    else:
-        total_confidence = 0.0
-
     # Audio duration: use word-level if available, otherwise use utterance end times
     if words:
         audio_duration = max((w["end"] for w in words), default=0)
@@ -517,15 +522,18 @@ def format_result(
     else:
         audio_duration = 0
 
-    formatted = {
+    formatted: dict[str, Any] = {
         "text": full_text,
         "words": words,
         "utterances": utterances,
-        "confidence": total_confidence,
         "audio_duration": audio_duration,
         "language_code": language,
     }
 
+    # Only include confidence if we have word-level data
+    if words:
+        formatted["confidence"] = sum(w["confidence"] for w in words) / len(words)
+
     # Include speaker embeddings if available
     if speaker_embeddings:
         formatted["speaker_embeddings"] = speaker_embeddings