Skip to content

Commit e202cb5

Browse files
committed
Improve RAG recall: more chunks, synonym expansion
1 parent: 0379da4 · commit: e202cb5

2 files changed

Lines changed: 26 additions & 6 deletions

File tree

backend/app/agent/core.py

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -74,8 +74,8 @@ async def run(self, query: str, history: Optional[List[Dict[str, str]]] = None,
7474
logger.info(f"[RAG] Query: {query[:50]}...")
7575

7676
# Step 1: Execute BOTH searches (hybrid)
77-
semantic_results = await self.semantic_tool.execute(query=query, limit=5)
78-
keyword_results = await self.keyword_tool.execute(query=query, limit=5)
77+
semantic_results = await self.semantic_tool.execute(query=query, limit=10)
78+
keyword_results = await self.keyword_tool.execute(query=query, limit=10)
7979

8080
# Step 2: Merge with RRF and log which path was used
8181
if semantic_results and keyword_results:
@@ -97,9 +97,9 @@ async def run(self, query: str, history: Optional[List[Dict[str, str]]] = None,
9797
preview = doc.get('content', '')[:80].replace('\n', ' ')
9898
logger.info(f"Doc {i+1}: {source} - {preview}...")
9999

100-
# Step 3: Build context and generate answer
100+
# Step 3: Build context and generate answer (use top 8 chunks for better coverage)
101101
context_parts = []
102-
for doc in merged_docs[:5]:
102+
for doc in merged_docs[:8]:
103103
source = doc.get('metadata', {}).get('source', 'unknown')
104104
content = doc.get('content', '')
105105
if content:

backend/app/services/tools/keyword_search.py

Lines changed: 22 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -13,6 +13,20 @@
1313
'at', 'to', 'for', 'of', 'with', 'by', 'about', 'can', 'could', 'would',
1414
'should', 'have', 'has', 'had', 'be', 'been', 'being', 'this', 'that'}
1515

16+
# Query expansion: synonyms for better recall
17+
SYNONYMS = {
18+
'job': ['work', 'position', 'role', 'employment', 'experience'],
19+
'work': ['job', 'position', 'role', 'employment', 'experience'],
20+
'company': ['employer', 'organization', 'firm'],
21+
'skill': ['skills', 'technology', 'technologies', 'expertise'],
22+
'skills': ['skill', 'technology', 'technologies', 'expertise'],
23+
'education': ['degree', 'diploma', 'university', 'school', 'training'],
24+
'project': ['projects', 'portfolio', 'work'],
25+
'projects': ['project', 'portfolio', 'work'],
26+
'language': ['languages', 'speak', 'fluent'],
27+
'languages': ['language', 'speak', 'fluent'],
28+
}
29+
1630

1731
class KeywordSearchTool(BaseTool):
1832
name = "keyword_search"
@@ -21,7 +35,7 @@ class KeywordSearchTool(BaseTool):
2135
def _build_tsquery(self, query: str) -> str:
2236
"""
2337
Build OR-based tsquery from natural language query.
24-
Filters stop words and joins remaining terms with OR (|).
38+
Filters stop words, expands synonyms, joins with OR (|).
2539
"""
2640
# Extract words, lowercase, filter stop words
2741
words = re.findall(r'\b\w+\b', query.lower())
@@ -31,8 +45,14 @@ def _build_tsquery(self, query: str) -> str:
3145
# Fallback: use all words if filtering removed everything
3246
keywords = [w for w in words if len(w) > 2]
3347

48+
# Expand with synonyms for better recall
49+
expanded = set(keywords)
50+
for word in keywords:
51+
if word in SYNONYMS:
52+
expanded.update(SYNONYMS[word])
53+
3454
# Join with OR operator for PostgreSQL tsquery
35-
return ' | '.join(keywords) if keywords else query
55+
return ' | '.join(expanded) if expanded else query
3656

3757
async def execute(self, query: str, limit: int = 10) -> List[Dict[str, Any]]:
3858
"""

0 commit comments

Comments
 (0)