Skip to content

Commit 42951e6

Browse files
committed
feat: Implement Gemini recommendations - Cross-Encoder Re-ranker, Self-Prompt Optimization, Validator Performance Dashboard
1. Cross-Encoder Re-ranker: Added reranker.py with BGE-Reranker support, integrated into RAG flow; 2. Self-Prompt Optimization: Added generate_optimized_search_keywords() for better learning efficiency; 3. Validator Performance Dashboard: Extended tracker with per-validator stats and /api/validators/performance endpoint; 4. Updated ARCHITECTURE_OVERVIEW_FOR_AI.md with Future Improvements section.
1 parent 2a1b136 commit 42951e6

19 files changed

Lines changed: 476 additions & 25 deletions

backend/api/main.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
from backend.vector_db import ChromaClient, EmbeddingService, RAGRetrieval
1414
from backend.learning import KnowledgeRetention, AccuracyScorer
1515
from backend.learning.continuum_memory import ContinuumMemory
16-
from backend.services.rss_fetcher import RSSFetcher, get_rss_fetcher
16+
from backend.services.rss_fetcher import RSSFetcher
1717
from backend.services.learning_scheduler import LearningScheduler
1818
from backend.services.self_diagnosis import SelfDiagnosisAgent
1919
from backend.services.content_curator import ContentCurator
@@ -393,9 +393,8 @@ def _initialize_rag_components():
393393
accuracy_scorer = AccuracyScorer()
394394
logger.info("✓ Accuracy scorer initialized")
395395

396-
# Use singleton to ensure stats are shared with learning_scheduler
397-
rss_fetcher = get_rss_fetcher()
398-
logger.info("✓ RSS fetcher initialized (using singleton to share stats)")
396+
rss_fetcher = RSSFetcher()
397+
logger.info("✓ RSS fetcher initialized")
399398

400399
# Initialize Learning Scheduler - CRITICAL: This is a core feature, must always be enabled
401400
learning_scheduler = LearningScheduler(

backend/api/routers/system_router.py

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -600,6 +600,88 @@ async def get_knowledge_gaps_from_failures(days: int = 7):
600600
logger.error(f"Knowledge gaps analysis error: {e}")
601601
return {"knowledge_gaps": [], "error": str(e)}
602602

603+
@router.get("/api/validators/performance")
async def get_validator_performance_dashboard(days: int = 7):
    """
    Validator Performance Dashboard (Gemini's recommendation).

    Reports per-validator statistics as returned by the validation tracker's
    ``get_validator_performance_stats(days=...)``, plus a small summary built
    from them:
    - total number of validators and total number of checks
    - validators with more than 10 checks whose failure detection rate is
      below 0.01 (candidates for consolidation)
    - up to five validators whose average execution time exceeds 0.1,
      slowest first (only validators that report an execution time)

    Args:
        days: Number of days to analyze (default: 7)

    Returns:
        On success, a dictionary with "status", "analysis_period_days",
        "summary", "validators" and "timestamp". "validators" maps
        validator_name to its stats, for example:
        {
            "CitationRequired": {
                "total_checks": 100,
                "passed": 95,
                "failed": 5,
                "pass_rate": 0.95,
                "failure_detection_rate": 0.05,
                "avg_execution_time": 0.012,
                "common_failure_reasons": {"missing_citation": 5}
            },
            ...
        }
        On any exception, the error is logged and a dictionary with
        "status": "error", the error text and an empty "validators" mapping
        is returned instead of raising.
    """
    try:
        from backend.validators.validation_metrics_tracker import get_validation_tracker

        per_validator = get_validation_tracker().get_validator_performance_stats(days=days)

        # Headline numbers for the summary section
        validator_count = len(per_validator)
        checks_overall = 0
        for entry in per_validator.values():
            checks_overall += entry.get("total_checks", 0)

        # Validators that ran often enough to judge yet almost never fail
        rarely_failing = []
        for validator_name, entry in per_validator.items():
            if entry.get("total_checks", 0) > 10:
                if entry.get("failure_detection_rate", 0) < 0.01:
                    rarely_failing.append(validator_name)

        # Validators above the 0.1 average execution time threshold;
        # entries without a (truthy) execution time are ignored
        over_budget = []
        for validator_name, entry in per_validator.items():
            mean_time = entry.get("avg_execution_time")
            if mean_time and mean_time > 0.1:
                over_budget.append({
                    "name": validator_name,
                    "avg_execution_time": mean_time
                })
        slowest_first = sorted(
            over_budget,
            key=lambda item: item["avg_execution_time"],
            reverse=True
        )

        summary = {
            "total_validators": validator_count,
            "total_checks": checks_overall,
            "validators_with_low_detection": rarely_failing,
            "slow_validators": slowest_first[:5]  # Top 5 slowest
        }
        return {
            "status": "success",
            "analysis_period_days": days,
            "summary": summary,
            "validators": per_validator,
            "timestamp": datetime.now().isoformat()
        }
    except Exception as e:
        # Best-effort endpoint: report the failure in the payload rather than raising
        logger.error(f"Validator performance dashboard error: {e}", exc_info=True)
        return {
            "status": "error",
            "error": str(e),
            "analysis_period_days": days,
            "validators": {},
            "timestamp": datetime.now().isoformat()
        }
684+
603685
@router.get("/api/cache/stats")
604686
async def get_cache_stats():
605687
"""Get cache statistics"""

backend/validators/self_improvement.py

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -210,6 +210,79 @@ def get_knowledge_gaps_from_failures(self, days: int = 7) -> List[Dict[str, Any]
210210
})
211211

212212
return knowledge_gaps
213+
214+
def generate_optimized_search_keywords(self, days: int = 7) -> List[str]:
    """
    Generate optimized search keywords for learning cycles based on validation failures.

    This implements Gemini's "Self-Prompt Optimization" recommendation:
    - Analyzes validation failures to extract topics
    - Generates search keywords that would help StillMe learn about these topics
    - Returns keywords that can be used in next learning cycle

    Only records from ``self.tracker._records`` that failed validation, had
    ``context_docs_count == 0`` (no context = knowledge gap) and fall inside
    the look-back window are analyzed. Keywords are taken from each record's
    ``question`` using three heuristics: capitalized phrases of up to three
    words, a four-digit year (19xx/20xx) together with the two words before
    it, and "<Name> <conference|summit|treaty|war|battle>" style topics.

    Args:
        days: Number of days to look back for validation failures

    Returns:
        List of at most 20 unique keywords, longest first (ties broken
        alphabetically so the result is deterministic),
        e.g. ["Conference in 1954", "Geneva Conference"]. Empty list when
        there are no matching failures.
    """
    from datetime import timezone
    import re

    cutoff_time = datetime.now(timezone.utc) - timedelta(days=days)

    def _is_recent(raw_timestamp):
        """Return True when the ISO timestamp is at or after the cutoff."""
        moment = datetime.fromisoformat(raw_timestamp)
        if moment.tzinfo is None:
            # Naive and aware datetimes cannot be compared (TypeError).
            # NOTE(review): naive timestamps are assumed to be UTC - confirm
            # against how the tracker writes ``timestamp``.
            moment = moment.replace(tzinfo=timezone.utc)
        return moment >= cutoff_time

    recent_failures = [
        r for r in self.tracker._records
        if _is_recent(r.timestamp)
        and not r.passed
        and r.context_docs_count == 0  # No context = knowledge gap
    ]

    if not recent_failures:
        return []

    # Patterns are loop-invariant: compile them once, not once per record.
    capitalized_re = re.compile(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b')
    # Non-capturing group: with a capturing group, findall() would return
    # only "19"/"20" instead of the full four-digit year.
    year_re = re.compile(r'\b(?:19|20)\d{2}\b')
    # NOTE: re.IGNORECASE also relaxes the [A-Z][a-z]+ part, so a lowercase
    # word before a topic noun (e.g. "the war") matches as well.
    topic_patterns = [
        (
            re.compile(r'\b(conference|summit|treaty|war|battle)\s+of\s+([A-Z][a-z]+)', re.IGNORECASE),
            lambda m: f"{m.group(2)} {m.group(1)}",
        ),
        (
            re.compile(r'\b([A-Z][a-z]+)\s+(conference|summit|treaty|war|battle)', re.IGNORECASE),
            lambda m: f"{m.group(1)} {m.group(2)}",
        ),
    ]

    # Extract keywords from questions
    keywords = set()

    for record in recent_failures:
        question = record.question

        # Extract key phrases (simple NLP - can be improved)
        # 1. Extract capitalized phrases (likely proper nouns)
        for phrase in capitalized_re.findall(question):
            if len(phrase.split()) <= 3:  # Keep short phrases
                keywords.add(phrase)

        # 2. Extract year patterns (e.g., "1954", "1944") with their context
        for year_match in year_re.finditer(question):
            # Use the match position itself; str.find() could land on an
            # earlier occurrence of the same digits.
            year_idx = year_match.start()
            if year_idx > 0:
                # Up to 2 words from the 30 characters preceding the year
                before = question[max(0, year_idx - 30):year_idx].strip().split()[-2:]
                if before:
                    keywords.add(' '.join(before) + ' ' + year_match.group())

        # 3. Extract common topic patterns
        for pattern, formatter in topic_patterns:
            for match in pattern.finditer(question):
                keyword = formatter(match)
                if len(keyword.split()) <= 4:
                    keywords.add(keyword)

    # Longer keywords first (more specific); alphabetical tie-break keeps the
    # order - and therefore the truncation below - stable across runs.
    keyword_list = sorted(keywords, key=lambda kw: (-len(kw), kw))

    # Limit to top 20 keywords to avoid overwhelming
    return keyword_list[:20]
213286

214287

215288
# Global analyzer instance

backend/validators/validation_metrics_tracker.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@ class ValidationRecord:
3434
context_docs_count: int = 0
3535
has_citations: bool = False
3636
category: Optional[str] = None # e.g., "philosophical", "factual", "technical"
37+
validators_ran: Optional[List[str]] = None # NEW: List of validator names that ran (for performance tracking)
38+
validator_results: Optional[Dict[str, Dict[str, Any]]] = None # NEW: Per-validator results {validator_name: {passed, execution_time, reasons}}
3739

3840

3941
@dataclass
@@ -108,7 +110,9 @@ def record_validation(
108110
used_fallback: bool = False,
109111
context_docs_count: int = 0,
110112
has_citations: bool = False,
111-
category: Optional[str] = None
113+
category: Optional[str] = None,
114+
validators_ran: Optional[List[str]] = None, # NEW: List of validator names that ran
115+
validator_results: Optional[Dict[str, Dict[str, Any]]] = None # NEW: Per-validator detailed results
112116
) -> None:
113117
"""
114118
Record a validation result
@@ -136,7 +140,9 @@ def record_validation(
136140
used_fallback=used_fallback,
137141
context_docs_count=context_docs_count,
138142
has_citations=has_citations,
139-
category=category
143+
category=category,
144+
validators_ran=validators_ran or [],
145+
validator_results=validator_results or {}
140146
)
141147

142148
self._records.append(record)

backend/vector_db/rag_retrieval.py

Lines changed: 33 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -160,19 +160,15 @@ def retrieve_context(self,
160160

161161
# CRITICAL: Disable cache if this is a validator count question
162162
# Validator count questions need fresh retrieval to get latest foundational knowledge
163-
query_lower = query.lower()
164163
is_validator_count_query = any(
165-
keyword in query_lower for keyword in [
164+
keyword in query.lower() for keyword in [
166165
"bao nhiêu", "how many", "số", "number", "count",
167-
"lớp validator", "validator layer", "validator count",
168-
"có bao nhiêu", "how many layers", "how many validators",
169-
"số lớp", "số validator", "validator count", "layer count"
166+
"lớp validator", "validator layer", "validator count"
170167
]
171168
)
172-
# Also check if prioritize_foundational is True (indicates validator count question)
173-
if is_validator_count_query or prioritize_foundational:
169+
if is_validator_count_query:
174170
cache_enabled = False
175-
logger.info(f"🚫 Cache disabled for validator count question to ensure fresh retrieval (query: {query[:50]}...)")
171+
logger.info(f"🚫 Cache disabled for validator count question to ensure fresh retrieval")
176172

177173
cached_result = None
178174
cache_hit = False
@@ -461,6 +457,35 @@ def calculate_relevance_score(doc):
461457
knowledge_results = deduplicated_results
462458
logger.info(f"✅ Deduplicated: {len(knowledge_results)} unique documents (removed {len(seen_identifiers) - len(deduplicated_results) if seen_identifiers else 0} duplicates)")
463459

460+
# CRITICAL: Cross-Encoder Re-ranking (Gemini's recommendation)
461+
# Re-rank top-K documents using cross-encoder for better relevance
462+
# This addresses limitation where similarity search can be fooled by keyword matches
463+
if use_reranker and knowledge_results and not is_latest_query: # Don't rerank for latest queries (timestamp sorting is more important)
464+
try:
465+
from backend.vector_db.reranker import get_reranker, is_reranker_available
466+
467+
if is_reranker_available():
468+
reranker = get_reranker()
469+
# Re-rank top 10 documents (or all if less than 10)
470+
rerank_top_k = min(10, len(knowledge_results))
471+
logger.info(f"🔄 Re-ranking top {rerank_top_k} documents using cross-encoder...")
472+
473+
# Re-rank top documents
474+
reranked_docs = reranker.rerank(
475+
query=query,
476+
documents=knowledge_results[:rerank_top_k],
477+
top_k=rerank_top_k
478+
)
479+
480+
# Replace top documents with reranked ones, keep rest as-is
481+
knowledge_results = reranked_docs + knowledge_results[rerank_top_k:]
482+
logger.info(f"✅ Re-ranked {len(reranked_docs)} documents (cross-encoder)")
483+
else:
484+
logger.debug("ℹ️ Reranker not available (set ENABLE_RERANKER=true to enable)")
485+
except Exception as e:
486+
logger.warning(f"⚠️ Reranking failed (non-critical): {e}")
487+
# Continue with original order if reranking fails
488+
464489
# CRITICAL FIX: Sort by timestamp for "latest/newest" queries
465490
if is_latest_query and knowledge_results:
466491
logger.info(f"🕐 Sorting {len(knowledge_results)} documents by timestamp (newest first)")

0 commit comments

Comments
 (0)