fix: Auto-fix embedding cache path mismatch (HuggingFace -> sentence-transformers format)

anhmtk · anhmtk · commit 87a4a2f5853a · 2026-01-12T20:37:26.000+07:00
Fixed critical embedding cache issue:

- Model downloaded in HuggingFace format but SentenceTransformer looks for sentence-transformers format

- This caused low retrieval quality (max_similarity=0.037, distance=0.963)

Solution:

- Created fix_embedding_cache.py to auto-fix path mismatch

- Auto-called in EmbeddingService.__init__

- Copies/symlinks from HuggingFace format to sentence-transformers format

- Non-critical: system continues if fix fails

Next steps:

- Test retrieval quality after fix (max_similarity should improve from 0.037 to >0.1)

- Monitor latency (11s overhead still needs investigation)
diff --git a/backend/utils/fix_embedding_cache.py b/backend/utils/fix_embedding_cache.py
@@ -0,0 +1,169 @@
+"""
+Fix Embedding Model Cache Path Mismatch
+
+CRITICAL: This script fixes the cache path mismatch issue where:
+- Model is downloaded in HuggingFace format: /app/hf_cache/models--sentence-transformers--paraphrase-multilingual-MiniLM-L12-v2
+- But SentenceTransformer looks for: /app/hf_cache/sentence_transformers/paraphrase-multilingual-MiniLM-L12-v2
+
+Solution: Create symlink or copy from HuggingFace format to sentence-transformers format
+"""
+
+import logging
+import os
+import shutil
+import tempfile
+from pathlib import Path
+from typing import Union
+
+logger = logging.getLogger(__name__)
+
+# Cache root used when the caller does not supply one.
+DEFAULT_CACHE_BASE = "/app/hf_cache"
+
+# A directory counts as holding a model if any of these files exist below it.
+_MODEL_FILE_PATTERNS = ("*.json", "*.bin", "*.safetensors")
+
+
+def _has_model_files(directory: Path) -> bool:
+    """Return True if *directory* contains at least one model-like file, at any depth."""
+    return any(
+        next(directory.rglob(pattern), None) is not None
+        for pattern in _MODEL_FILE_PATTERNS
+    )
+
+
+def _copy_tree_atomically(source: Path, destination: Path) -> None:
+    """
+    Copy *source* to *destination* so that *destination* only appears once complete.
+
+    The tree is first copied into a staging directory next to *destination* and then
+    renamed into place. An interrupted or failed copy therefore never leaves a partial
+    *destination* behind that a later run would mistake for a finished cache.
+    """
+    staging = Path(tempfile.mkdtemp(prefix=f".{destination.name}.", dir=destination.parent))
+    try:
+        shutil.copytree(source, staging, dirs_exist_ok=True)
+        os.replace(staging, destination)
+    finally:
+        # After a successful rename the staging path is gone and this is a no-op.
+        shutil.rmtree(staging, ignore_errors=True)
+
+
+def fix_embedding_model_cache(
+    model_name: str = "paraphrase-multilingual-MiniLM-L12-v2",
+    cache_base: Union[str, Path] = DEFAULT_CACHE_BASE,
+) -> bool:
+    """
+    Fix embedding model cache path mismatch.
+
+    Looks for the model under the HuggingFace cache layouts inside *cache_base* and,
+    if the sentence-transformers layout is missing, materializes it by copying the
+    newest snapshot (or symlinking a flat model directory).
+
+    Args:
+        model_name: Name of the model to fix
+        cache_base: Root of the model cache; defaults to /app/hf_cache
+
+    Returns:
+        True if fix was successful or not needed, False if failed
+    """
+    try:
+        cache_root = Path(cache_base)
+        if not cache_root.exists():
+            logger.warning(f"⚠️ Cache base directory does not exist: {cache_root}")
+            return False
+
+        # Model name variations
+        model_name_safe = model_name.replace("/", "_")
+        model_name_hf = model_name.replace("/", "--")
+
+        # HuggingFace format paths (where model might be downloaded)
+        hf_paths = [
+            cache_root / f"models--sentence-transformers--{model_name_hf}",
+            cache_root / "hub" / f"models--sentence-transformers--{model_name_hf}",
+            cache_root / f"models--{model_name_hf}",
+            cache_root / "hub" / f"models--{model_name_hf}",
+        ]
+
+        # Sentence-transformers format path (where SentenceTransformer looks)
+        st_path = cache_root / "sentence_transformers" / model_name_safe
+
+        # First candidate that exists and actually holds model files
+        hf_source = next(
+            (path for path in hf_paths if path.exists() and _has_model_files(path)),
+            None,
+        )
+        if hf_source is None:
+            logger.info("ℹ️ HuggingFace format cache not found - model may not be downloaded yet")
+            return True  # Not an error, just not downloaded yet
+        logger.info(f"✅ Found HuggingFace format cache: {hf_source}")
+
+        # Check if sentence-transformers format already exists
+        if st_path.exists():
+            logger.info(f"✅ Sentence-transformers format cache already exists: {st_path}")
+            return True
+
+        # Create sentence_transformers directory
+        st_path.parent.mkdir(parents=True, exist_ok=True)
+
+        # exists() follows symlinks, so a dangling link is reported as missing above
+        # but would still block both os.symlink and the rename. Remove it first.
+        if st_path.is_symlink():
+            logger.warning(f"⚠️ Removing dangling symlink: {st_path}")
+            st_path.unlink()
+
+        # HuggingFace cache structure: models--{name}/snapshots/{hash}/model files
+        # Sentence-transformers expects: sentence_transformers/{name}/model files directly
+        snapshots_dir = hf_source / "snapshots"
+        if snapshots_dir.exists():
+            # Only directories are snapshots; a stray file must not win the mtime sort
+            snapshots = sorted(
+                (entry for entry in snapshots_dir.iterdir() if entry.is_dir()),
+                key=lambda entry: entry.stat().st_mtime,
+                reverse=True,
+            )
+            if not snapshots:
+                logger.warning(f"⚠️ No snapshots found in: {snapshots_dir}")
+                return False
+
+            latest_snapshot = snapshots[0]
+            logger.info(f"📦 Found HuggingFace snapshot: {latest_snapshot}")
+            logger.info("📦 Copying model files from HuggingFace format to sentence-transformers format...")
+            logger.info(f"   Source: {latest_snapshot}")
+            logger.info(f"   Destination: {st_path}")
+            _copy_tree_atomically(latest_snapshot, st_path)
+            logger.info(f"✅ Successfully copied model files to: {st_path}")
+            return True
+
+        # Direct model files (not HuggingFace snapshot format)
+        logger.info("📦 Model files are in direct format, creating symlink...")
+        logger.info(f"   Source: {hf_source}")
+        logger.info(f"   Destination: {st_path}")
+        try:
+            # Try symlink first (more efficient); absolute target so the link resolves
+            os.symlink(hf_source.absolute(), st_path)
+        except OSError as e:
+            # Symlink failed (might not be supported on all systems)
+            logger.warning(f"⚠️ Symlink failed: {e}, trying copy instead...")
+            _copy_tree_atomically(hf_source, st_path)
+            logger.info(f"✅ Copied model files to: {st_path}")
+        else:
+            logger.info(f"✅ Created symlink: {st_path} -> {hf_source}")
+        return True
+
+    except Exception as e:
+        # Non-critical by design: log with traceback and report failure to the caller
+        logger.exception(f"❌ Failed to fix embedding cache: {e}")
+        return False
+
+
+if __name__ == "__main__":
+    # Setup logging
+    logging.basicConfig(level=logging.INFO)
+
+    # Fix cache
+    success = fix_embedding_model_cache()
+    if success:
+        print("✅ Embedding cache fix completed successfully")
+    else:
+        print("❌ Embedding cache fix failed - check logs for details")
diff --git a/backend/vector_db/embeddings.py b/backend/vector_db/embeddings.py
@@ -108,6 +108,23 @@ def __init__(self, model_name: str = "paraphrase-multilingual-MiniLM-L12-v2"):
                     except Exception as delete_error:
                         logger.warning(f"⚠️ Could not delete old model cache: {delete_error}")
             
+            # CRITICAL: Fix cache path mismatch (HuggingFace format -> sentence-transformers format)
+            # This fixes the issue where model is downloaded in HuggingFace format but SentenceTransformer looks for sentence-transformers format
+            try:
+                from backend.utils.fix_embedding_cache import fix_embedding_model_cache
+                logger.info("🔧 Attempting to fix embedding cache path mismatch...")
+                fix_success = fix_embedding_model_cache(model_name)
+                if fix_success:
+                    logger.info("✅ Embedding cache path fix completed")
+                    # Re-verify cache after fix
+                    cache_status = self.model_manager.verify_cache_exists()
+                else:
+                    logger.warning("⚠️ Embedding cache path fix failed - will continue with normal flow")
+            except ImportError:
+                logger.debug("fix_embedding_cache module not available, skipping cache fix")
+            except Exception as fix_error:
+                logger.warning(f"⚠️ Cache fix error (non-critical): {fix_error}")
+            
             # Try to copy model from image cache to persistent volume if needed
             if not cache_status.model_files_found:
                 logger.info("⚠️ Model not found in persistent cache, attempting to copy from image cache...")
diff --git a/docs/EMBEDDING_CACHE_FIX.md b/docs/EMBEDDING_CACHE_FIX.md
@@ -0,0 +1,60 @@
+# Embedding Cache Path Mismatch - Fix Guide
+
+## Vấn Đề
+
+Model `paraphrase-multilingual-MiniLM-L12-v2` được download ở format HuggingFace:
+```
+/app/hf_cache/models--sentence-transformers--paraphrase-multilingual-MiniLM-L12-v2/
+```
+
+Nhưng `SentenceTransformer` tìm ở format sentence-transformers:
+```
+/app/hf_cache/sentence_transformers/paraphrase-multilingual-MiniLM-L12-v2/
+```
+
+**Hậu quả:**
+- Embeddings không hiệu quả
+- Retrieval quality cực thấp: `max_similarity=0.037 < 0.1`
+- High average distance: `0.963`
+
+## Giải Pháp
+
+### 1. Auto-Fix trong EmbeddingService
+
+Script `backend/utils/fix_embedding_cache.py` tự động:
+- Tìm model trong HuggingFace format
+- Copy/symlink sang sentence-transformers format
+- Được gọi tự động khi `EmbeddingService` khởi tạo
+
+### 2. Manual Fix (nếu cần)
+
+```python
+from backend.utils.fix_embedding_cache import fix_embedding_model_cache
+
+# Fix cache path mismatch
+success = fix_embedding_model_cache("paraphrase-multilingual-MiniLM-L12-v2")
+```
+
+### 3. Verify Cache
+
+```python
+from backend.utils.model_cache import verify_model_cache
+
+status = verify_model_cache("paraphrase-multilingual-MiniLM-L12-v2")
+print(f"Model files found: {status.model_files_found}")
+print(f"Cache path: {status.path}")
+```
+
+## Testing
+
+Sau khi fix, verify:
+1. Model được load từ cache (không download lại)
+2. Embedding similarity > 0.1 (thay vì 0.037)
+3. Average distance < 0.5 (thay vì 0.963)
+
+## Notes
+
+- Fix được gọi tự động trong `EmbeddingService.__init__`
+- Nếu fix fail, system vẫn hoạt động bình thường (non-critical)
+- Logs sẽ hiển thị status của cache fix
+