Skip to content

Commit 87a4a2f

Browse files
committed
fix: Auto-fix embedding cache path mismatch (HuggingFace -> sentence-transformers format)
Fixed critical embedding cache issue: - Model downloaded in HuggingFace format but SentenceTransformer looks for sentence-transformers format - This caused low retrieval quality (max_similarity=0.037, distance=0.963) Solution: - Created fix_embedding_cache.py to auto-fix path mismatch - Auto-called in EmbeddingService.__init__ - Copies/symlinks from HuggingFace format to sentence-transformers format - Non-critical: system continues if fix fails Next steps: - Test retrieval quality after fix (should improve from 0.037 to >0.1) - Monitor latency (11s overhead still needs investigation)
1 parent 2a16a5b commit 87a4a2f

3 files changed

Lines changed: 210 additions & 0 deletions

File tree

Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
1+
"""
2+
Fix Embedding Model Cache Path Mismatch
3+
4+
CRITICAL: This script fixes the cache path mismatch issue where:
5+
- Model is downloaded in HuggingFace format: /app/hf_cache/models--sentence-transformers--paraphrase-multilingual-MiniLM-L12-v2
6+
- But SentenceTransformer looks for: /app/hf_cache/sentence_transformers/paraphrase-multilingual-MiniLM-L12-v2
7+
8+
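Solution: Copy the newest HuggingFace snapshot into the sentence-transformers layout; if the cached model is a flat directory (no snapshots/), symlink it instead and fall back to a copy when symlinks are unavailable.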
9+
"""
10+
11+
import os
12+
import shutil
13+
import logging
14+
from pathlib import Path
15+
16+
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
17+
18+
19+
def _has_model_files(path: Path) -> bool:
    """Return True if *path* exists and contains any JSON/bin/safetensors file."""
    if not path.exists():
        return False
    return any(
        any(path.rglob(pattern))
        for pattern in ("*.json", "*.bin", "*.safetensors")
    )


def _is_populated(path: Path) -> bool:
    """Return True if *path* resolves to a directory holding at least one entry.

    A dangling symlink or an empty directory (e.g. left behind by an
    interrupted run) is reported as not populated.
    """
    return path.is_dir() and any(path.iterdir())


def _remove_stale_target(path: Path) -> None:
    """Remove an unusable leftover at *path* (dangling symlink or empty dir).

    Only called after ``_is_populated(path)`` returned False, so a directory
    found here is empty and ``rmdir`` is sufficient. Regular files are left
    untouched on purpose; the subsequent copy/symlink will then fail loudly.
    """
    if path.is_symlink():
        path.unlink()
    elif path.is_dir():
        path.rmdir()


def _select_snapshot(hf_repo_dir: Path):
    """Pick the snapshot directory to export from a HuggingFace cache repo.

    Prefers the revision recorded in ``refs/main``; if that file is missing,
    unreadable or points to a snapshot that is not on disk, falls back to the
    most recently modified directory under ``snapshots/``.

    Returns:
        The snapshot ``Path``, or ``None`` when no snapshot directory exists.
    """
    snapshots_dir = hf_repo_dir / "snapshots"

    try:
        revision = (hf_repo_dir / "refs" / "main").read_text(encoding="utf-8").strip()
    except (OSError, UnicodeError):
        revision = ""
    # Guard against an empty ref: ``snapshots_dir / ""`` would be snapshots_dir itself.
    if revision and (snapshots_dir / revision).is_dir():
        return snapshots_dir / revision

    candidates = [entry for entry in snapshots_dir.iterdir() if entry.is_dir()]
    return max(candidates, key=lambda entry: entry.stat().st_mtime, default=None)


def _copy_tree_atomically(source: Path, destination: Path) -> None:
    """Copy *source* to *destination* so that *destination* is never half-written.

    The tree is first copied into a sibling staging directory (same parent, so
    the final rename stays on one filesystem) and then renamed into place.
    *destination* must not exist when this is called.
    """
    staging = destination.with_name(f".{destination.name}.tmp-{os.getpid()}")
    shutil.rmtree(staging, ignore_errors=True)  # leftover from a crashed run
    try:
        # Default symlinks=False: symlinked snapshot entries are copied as real files.
        shutil.copytree(source, staging)
        os.replace(staging, destination)
    finally:
        # After a successful rename the staging path is gone and this is a no-op.
        shutil.rmtree(staging, ignore_errors=True)


def fix_embedding_model_cache(
    model_name: str = "paraphrase-multilingual-MiniLM-L12-v2",
    cache_base: str = "/app/hf_cache",
) -> bool:
    """
    Fix embedding model cache path mismatch.

    Looks for the model in HuggingFace cache layout under ``cache_base`` and
    makes it available at ``<cache_base>/sentence_transformers/<model_name>``
    (with ``/`` in the name replaced by ``_``):

    * snapshot layout (``snapshots/<hash>/``): the selected snapshot is copied;
    * flat layout (no ``snapshots`` directory): the directory is symlinked,
      falling back to a copy when the symlink cannot be created.

    Copies are staged and renamed into place, so an interrupted run never
    leaves a partial target that a later run would mistake for a valid cache.
    An existing target is only considered valid if it is a non-empty directory.

    Args:
        model_name: Name of the model to fix
        cache_base: Root of the model cache directory

    Returns:
        True if fix was successful or not needed, False if failed
    """
    try:
        cache_root = Path(cache_base)
        if not cache_root.exists():
            logger.warning(f"⚠️ Cache base directory does not exist: {cache_root}")
            return False

        # Model name variations
        model_name_safe = model_name.replace("/", "_")
        model_name_hf = model_name.replace("/", "--")

        # HuggingFace format paths (where the model might have been downloaded)
        hf_candidates = [
            cache_root / f"models--sentence-transformers--{model_name_hf}",
            cache_root / "hub" / f"models--sentence-transformers--{model_name_hf}",
            cache_root / f"models--{model_name_hf}",
            cache_root / "hub" / f"models--{model_name_hf}",
        ]

        # Sentence-transformers format path (where SentenceTransformer looks)
        st_path = cache_root / "sentence_transformers" / model_name_safe

        hf_source = next(
            (candidate for candidate in hf_candidates if _has_model_files(candidate)),
            None,
        )
        if hf_source is None:
            logger.info("ℹ️ HuggingFace format cache not found - model may not be downloaded yet")
            return True  # Not an error, just not downloaded yet
        logger.info(f"✅ Found HuggingFace format cache: {hf_source}")

        if _is_populated(st_path):
            logger.info(f"✅ Sentence-transformers format cache already exists: {st_path}")
            return True

        st_path.parent.mkdir(parents=True, exist_ok=True)
        # An empty directory or dangling symlink would block the rename/symlink below.
        _remove_stale_target(st_path)

        # HuggingFace cache structure: models--{name}/snapshots/{hash}/model files
        # Sentence-transformers expects: sentence_transformers/{name}/model files directly
        snapshots_dir = hf_source / "snapshots"
        if snapshots_dir.exists():
            snapshot = _select_snapshot(hf_source)
            if snapshot is None:
                logger.warning(f"⚠️ No snapshots found in: {snapshots_dir}")
                return False

            logger.info(f"📦 Found HuggingFace snapshot: {snapshot}")
            logger.info("📦 Copying model files from HuggingFace format to sentence-transformers format...")
            logger.info(f"   Source: {snapshot}")
            logger.info(f"   Destination: {st_path}")
            _copy_tree_atomically(snapshot, st_path)
            logger.info(f"✅ Successfully copied model files to: {st_path}")
            return True

        # Direct model files (not HuggingFace snapshot format)
        logger.info("📦 Model files are in direct format, creating symlink...")
        logger.info(f"   Source: {hf_source}")
        logger.info(f"   Destination: {st_path}")
        try:
            # Symlink first: no extra disk space is used.
            os.symlink(hf_source, st_path, target_is_directory=True)
        except OSError as e:
            # Symlink failed (might not be supported on all systems)
            logger.warning(f"⚠️ Symlink failed: {e}, trying copy instead...")
            _copy_tree_atomically(hf_source, st_path)
            logger.info(f"✅ Copied model files to: {st_path}")
        else:
            logger.info(f"✅ Created symlink: {st_path} -> {hf_source}")
        return True

    except Exception as e:
        # Best-effort by design: callers continue without the fix, but keep the traceback.
        logger.exception(f"❌ Failed to fix embedding cache: {e}")
        return False
121+
122+
123+
if __name__ == "__main__":
    # Standalone invocation: show INFO-level progress from the fixer on the console.
    logging.basicConfig(level=logging.INFO)

    # Run the fix with its default arguments and report the outcome in one line.
    outcome_message = (
        "✅ Embedding cache fix completed successfully"
        if fix_embedding_model_cache()
        else "❌ Embedding cache fix failed - check logs for details"
    )
    print(outcome_message)
133+

backend/vector_db/embeddings.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,23 @@ def __init__(self, model_name: str = "paraphrase-multilingual-MiniLM-L12-v2"):
108108
except Exception as delete_error:
109109
logger.warning(f"⚠️ Could not delete old model cache: {delete_error}")
110110

111+
# CRITICAL: Fix cache path mismatch (HuggingFace format -> sentence-transformers format)
112+
# This fixes the issue where model is downloaded in HuggingFace format but SentenceTransformer looks for sentence-transformers format
113+
try:
114+
from backend.utils.fix_embedding_cache import fix_embedding_model_cache
115+
logger.info("🔧 Attempting to fix embedding cache path mismatch...")
116+
fix_success = fix_embedding_model_cache(model_name)
117+
if fix_success:
118+
logger.info("✅ Embedding cache path fix completed")
119+
# Re-verify cache after fix
120+
cache_status = self.model_manager.verify_cache_exists()
121+
else:
122+
logger.warning("⚠️ Embedding cache path fix failed - will continue with normal flow")
123+
except ImportError:
124+
logger.debug("fix_embedding_cache module not available, skipping cache fix")
125+
except Exception as fix_error:
126+
logger.warning(f"⚠️ Cache fix error (non-critical): {fix_error}")
127+
111128
# Try to copy model from image cache to persistent volume if needed
112129
if not cache_status.model_files_found:
113130
logger.info("⚠️ Model not found in persistent cache, attempting to copy from image cache...")

docs/EMBEDDING_CACHE_FIX.md

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
# Embedding Cache Path Mismatch - Fix Guide
2+
3+
## Vấn Đề
4+
5+
Model `paraphrase-multilingual-MiniLM-L12-v2` được download ở format HuggingFace:
6+
```
7+
/app/hf_cache/models--sentence-transformers--paraphrase-multilingual-MiniLM-L12-v2/
8+
```
9+
10+
Nhưng `SentenceTransformer` tìm ở format sentence-transformers:
11+
```
12+
/app/hf_cache/sentence_transformers/paraphrase-multilingual-MiniLM-L12-v2/
13+
```
14+
15+
**Hậu quả:**
16+
- Embeddings không hiệu quả
17+
- Retrieval quality cực thấp: `max_similarity=0.037 < 0.1`
18+
- High average distance: `0.963`
19+
20+
## Giải Pháp
21+
22+
### 1. Auto-Fix trong EmbeddingService
23+
24+
Script `backend/utils/fix_embedding_cache.py` tự động:
25+
- Tìm model trong HuggingFace format
26+
- Copy/symlink sang sentence-transformers format
27+
- Được gọi tự động khi `EmbeddingService` khởi tạo
28+
29+
### 2. Manual Fix (nếu cần)
30+
31+
```python
32+
from backend.utils.fix_embedding_cache import fix_embedding_model_cache
33+
34+
# Fix cache path mismatch
35+
success = fix_embedding_model_cache("paraphrase-multilingual-MiniLM-L12-v2")
36+
```
37+
38+
### 3. Verify Cache
39+
40+
```python
41+
from backend.utils.model_cache import verify_model_cache
42+
43+
status = verify_model_cache("paraphrase-multilingual-MiniLM-L12-v2")
44+
print(f"Model files found: {status.model_files_found}")
45+
print(f"Cache path: {status.path}")
46+
```
47+
48+
## Testing
49+
50+
Sau khi fix, verify:
51+
1. Model được load từ cache (không download lại)
52+
2. `max_similarity` > 0.1 (thay vì 0.037)
53+
3. Average distance < 0.5 (thay vì 0.963)
54+
55+
## Notes
56+
57+
- Fix được gọi tự động trong `EmbeddingService.__init__`
58+
- Nếu fix fail, system vẫn hoạt động bình thường (non-critical)
59+
- Logs sẽ hiển thị status của cache fix
60+

0 commit comments

Comments
 (0)