From 2f8558e37cf56944a88bcf2346022303137a2a5b Mon Sep 17 00:00:00 2001 From: "praisonai-triage-agent[bot]" <272766704+praisonai-triage-agent[bot]@users.noreply.github.com> Date: Sat, 23 May 2026 14:13:19 +0000 Subject: [PATCH 1/5] fix: make deduplicate_chunks and build_context handle SearchResultItem objects - Add _extract_value helper to handle both dict and SearchResultItem objects - Update type hints to accept Union[Dict[str, Any], Any] in context functions - Maintain full backward compatibility with existing dict-based usage - Fix AttributeError when SearchResultItem objects are passed to .get() method Fixes #1732 Co-authored-by: praisonai-triage-agent[bot] --- .../praisonaiagents/rag/context.py | 74 ++++++++++++++----- 1 file changed, 54 insertions(+), 20 deletions(-) diff --git a/src/praisonai-agents/praisonaiagents/rag/context.py b/src/praisonai-agents/praisonaiagents/rag/context.py index 11d7bef21..f513031eb 100644 --- a/src/praisonai-agents/praisonaiagents/rag/context.py +++ b/src/praisonai-agents/praisonaiagents/rag/context.py @@ -6,7 +6,7 @@ """ import hashlib -from typing import Any, Dict, List, Optional, Set, Tuple +from typing import Any, Dict, List, Optional, Set, Tuple, Union def _estimate_tokens(text: str) -> int: @@ -19,6 +19,24 @@ def _estimate_tokens(text: str) -> int: return len(text) // 4 + 1 +def _extract_value(item: Union[Dict[str, Any], Any], key: str, default: Any = None) -> Any: + """ + Extract value from either dict or object (e.g., SearchResultItem). + + Args: + item: Dictionary or object with attributes + key: Key/attribute name to extract + default: Default value if key not found + + Returns: + Extracted value or default + """ + if isinstance(item, dict): + return item.get(key, default) + else: + return getattr(item, key, default) + + def _chunk_hash(text: str, source: Optional[str] = None) -> str: """ Generate stable hash for chunk deduplication. @@ -35,31 +53,38 @@ def _chunk_hash(text: str, source: Optional[str] = None) -> str: def deduplicate_chunks( - results: List[Dict[str, Any]], + results: List[Union[Dict[str, Any], Any]], similarity_threshold: float = 0.9, -) -> List[Dict[str, Any]]: +) -> List[Union[Dict[str, Any], Any]]: """ Deduplicate chunks by content hash. + Handles both dict format and SearchResultItem objects. + Args: - results: List of search results with 'text' or 'memory' key + results: List of search results (dicts or SearchResultItem objects) similarity_threshold: Not used currently (hash-based dedup) Returns: Deduplicated list of results """ seen_hashes: Set[str] = set() - unique_results: List[Dict[str, Any]] = [] + unique_results: List[Union[Dict[str, Any], Any]] = [] for result in results: # Skip None results if result is None: continue - # Handle different result formats - text = result.get("text") or result.get("memory", "") + + # Handle different result formats (dict or SearchResultItem) + text = _extract_value(result, "text") or _extract_value(result, "memory", "") + # CRITICAL: Handle metadata=None from mem0 - ensure always dict - metadata = result.get("metadata") or {} - source = metadata.get("source", "") + metadata = _extract_value(result, "metadata") or {} + if not isinstance(metadata, dict): + metadata = {} + + source = metadata.get("source", "") if isinstance(metadata, dict) else "" chunk_id = _chunk_hash(text, source) @@ -106,17 +131,19 @@ def truncate_context( def build_context( - results: List[Dict[str, Any]], + results: List[Union[Dict[str, Any], Any]], max_tokens: int = 4000, deduplicate: bool = True, separator: str = "\n\n---\n\n", include_source: bool = True, -) -> Tuple[str, List[Dict[str, Any]]]: +) -> Tuple[str, List[Union[Dict[str, Any], Any]]]: """ Build context string from retrieval results. + Handles both dict format and SearchResultItem objects. + Args: - results: List of search results + results: List of search results (dicts or SearchResultItem objects) max_tokens: Maximum tokens for context deduplicate: Whether to deduplicate chunks separator: Separator between chunks @@ -133,7 +160,7 @@ def build_context( results = deduplicate_chunks(results) context_parts: List[str] = [] - used_results: List[Dict[str, Any]] = [] + used_results: List[Union[Dict[str, Any], Any]] = [] current_tokens = 0 separator_tokens = _estimate_tokens(separator) @@ -141,17 +168,21 @@ def build_context( # Skip None results if result is None: continue - # Handle different result formats - text = result.get("text") or result.get("memory", "") + + # Handle different result formats (dict or SearchResultItem) + text = _extract_value(result, "text") or _extract_value(result, "memory", "") if not text: continue # Build chunk text with optional source if include_source: # CRITICAL: Handle metadata=None from mem0 - ensure always dict - metadata = result.get("metadata") or {} - source = metadata.get("source", "") - filename = metadata.get("filename", "") + metadata = _extract_value(result, "metadata") or {} + if not isinstance(metadata, dict): + metadata = {} + + source = metadata.get("source", "") if isinstance(metadata, dict) else "" + filename = metadata.get("filename", "") if isinstance(metadata, dict) else "" source_label = filename or source or f"Source {i + 1}" chunk_text = f"[{source_label}]\n{text}" else: @@ -194,11 +225,14 @@ def __init__( def build( self, - results: List[Dict[str, Any]], + results: List[Union[Dict[str, Any], Any]], max_tokens: int = 4000, deduplicate: bool = True, ) -> str: - """Build context string from results.""" + """Build context string from results. + + Handles both dict format and SearchResultItem objects. + """ context, _ = build_context( results=results, max_tokens=max_tokens, From 4c3a9105370a56fba60054c4949f4d07dafac9f7 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 25 May 2026 08:21:18 +0000 Subject: [PATCH 2/5] fix: improve SearchResultItem handling in rag context builder Agent-Logs-Url: https://github.com/MervinPraison/PraisonAI/sessions/fbaed72c-c0f6-4426-8cb4-09c3542f499f Co-authored-by: MervinPraison <454862+MervinPraison@users.noreply.github.com> --- .../praisonaiagents/rag/context.py | 30 +++++++++++-------- .../unit/rag/test_context_normalization.py | 23 ++++++++++++++ 2 files changed, 41 insertions(+), 12 deletions(-) diff --git a/src/praisonai-agents/praisonaiagents/rag/context.py b/src/praisonai-agents/praisonaiagents/rag/context.py index f513031eb..6168bef35 100644 --- a/src/praisonai-agents/praisonaiagents/rag/context.py +++ b/src/praisonai-agents/praisonaiagents/rag/context.py @@ -6,7 +6,13 @@ """ import hashlib -from typing import Any, Dict, List, Optional, Set, Tuple, Union +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Tuple, Union + +if TYPE_CHECKING: + from ..knowledge.models import SearchResultItem + + +ResultItem = Union[Dict[str, Any], "SearchResultItem"] def _estimate_tokens(text: str) -> int: @@ -19,7 +25,7 @@ def _estimate_tokens(text: str) -> int: return len(text) // 4 + 1 -def _extract_value(item: Union[Dict[str, Any], Any], key: str, default: Any = None) -> Any: +def _extract_value(item: ResultItem, key: str, default: Any = None) -> Any: """ Extract value from either dict or object (e.g., SearchResultItem). @@ -53,9 +59,9 @@ def _chunk_hash(text: str, source: Optional[str] = None) -> str: def deduplicate_chunks( - results: List[Union[Dict[str, Any], Any]], + results: List[ResultItem], similarity_threshold: float = 0.9, -) -> List[Union[Dict[str, Any], Any]]: +) -> List[ResultItem]: """ Deduplicate chunks by content hash. @@ -69,7 +75,7 @@ def deduplicate_chunks( Deduplicated list of results """ seen_hashes: Set[str] = set() - unique_results: List[Union[Dict[str, Any], Any]] = [] + unique_results: List[ResultItem] = [] for result in results: # Skip None results @@ -84,7 +90,7 @@ def deduplicate_chunks( if not isinstance(metadata, dict): metadata = {} - source = metadata.get("source", "") if isinstance(metadata, dict) else "" + source = metadata.get("source", "") or _extract_value(result, "source", "") chunk_id = _chunk_hash(text, source) @@ -131,12 +137,12 @@ def truncate_context( def build_context( - results: List[Union[Dict[str, Any], Any]], + results: List[ResultItem], max_tokens: int = 4000, deduplicate: bool = True, separator: str = "\n\n---\n\n", include_source: bool = True, -) -> Tuple[str, List[Union[Dict[str, Any], Any]]]: +) -> Tuple[str, List[ResultItem]]: """ Build context string from retrieval results. @@ -160,7 +166,7 @@ def build_context( results = deduplicate_chunks(results) context_parts: List[str] = [] - used_results: List[Union[Dict[str, Any], Any]] = [] + used_results: List[ResultItem] = [] current_tokens = 0 separator_tokens = _estimate_tokens(separator) @@ -181,8 +187,8 @@ def build_context( if not isinstance(metadata, dict): metadata = {} - source = metadata.get("source", "") if isinstance(metadata, dict) else "" - filename = metadata.get("filename", "") if isinstance(metadata, dict) else "" + source = metadata.get("source", "") or _extract_value(result, "source", "") + filename = metadata.get("filename", "") or _extract_value(result, "filename", "") source_label = filename or source or f"Source {i + 1}" chunk_text = f"[{source_label}]\n{text}" else: @@ -225,7 +231,7 @@ def __init__( def build( self, - results: List[Union[Dict[str, Any], Any]], + results: List[ResultItem], max_tokens: int = 4000, deduplicate: bool = True, ) -> str: diff --git a/src/praisonai-agents/tests/unit/rag/test_context_normalization.py b/src/praisonai-agents/tests/unit/rag/test_context_normalization.py index cb13a1bc6..68bf24688 100644 --- a/src/praisonai-agents/tests/unit/rag/test_context_normalization.py +++ b/src/praisonai-agents/tests/unit/rag/test_context_normalization.py @@ -8,6 +8,7 @@ deduplicate_chunks, build_context, ) +from praisonaiagents.knowledge.models import SearchResultItem class TestDeduplicateChunksNormalization: @@ -122,3 +123,25 @@ def test_empty_text_filtered(self): context, used = build_context(results) assert len(used) == 1 assert "valid content" in context + + +class TestSearchResultItemCompatibility: + """Tests for SearchResultItem object compatibility.""" + + def test_deduplicate_uses_source_attribute_fallback(self): + """Different source attributes should produce unique chunks.""" + results = [ + SearchResultItem(text="same content", source="a.pdf"), + SearchResultItem(text="same content", source="b.pdf"), + ] + deduped = deduplicate_chunks(results) + assert len(deduped) == 2 + + def test_build_context_uses_filename_attribute_fallback(self): + """Source label should use object filename/source attributes.""" + results = [ + SearchResultItem(text="chunk", filename="doc.md"), + ] + context, used = build_context(results, include_source=True) + assert len(used) == 1 + assert "doc.md" in context From 5906289c282580496a92e0f4fc0f83ede981e780 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 25 May 2026 08:21:55 +0000 Subject: [PATCH 3/5] refactor: tighten source fallback semantics in rag context Agent-Logs-Url: https://github.com/MervinPraison/PraisonAI/sessions/fbaed72c-c0f6-4426-8cb4-09c3542f499f Co-authored-by: MervinPraison <454862+MervinPraison@users.noreply.github.com> --- src/praisonai-agents/praisonaiagents/rag/context.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/praisonai-agents/praisonaiagents/rag/context.py b/src/praisonai-agents/praisonaiagents/rag/context.py index 6168bef35..36281d261 100644 --- a/src/praisonai-agents/praisonaiagents/rag/context.py +++ b/src/praisonai-agents/praisonaiagents/rag/context.py @@ -90,7 +90,9 @@ def deduplicate_chunks( if not isinstance(metadata, dict): metadata = {} - source = metadata.get("source", "") or _extract_value(result, "source", "") + source = metadata.get("source") + if source is None: + source = _extract_value(result, "source", "") chunk_id = _chunk_hash(text, source) @@ -187,8 +189,12 @@ def build_context( if not isinstance(metadata, dict): metadata = {} - source = metadata.get("source", "") or _extract_value(result, "source", "") - filename = metadata.get("filename", "") or _extract_value(result, "filename", "") + source = metadata.get("source") + if source is None: + source = _extract_value(result, "source", "") + filename = metadata.get("filename") + if filename is None: + filename = _extract_value(result, "filename", "") source_label = filename or source or f"Source {i + 1}" chunk_text = f"[{source_label}]\n{text}" else: From 0667726650b33e2340518af88147b6696af42e39 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 25 May 2026 08:22:31 +0000 Subject: [PATCH 4/5] refactor: dedupe metadata fallback extraction in rag context Agent-Logs-Url: https://github.com/MervinPraison/PraisonAI/sessions/fbaed72c-c0f6-4426-8cb4-09c3542f499f Co-authored-by: MervinPraison <454862+MervinPraison@users.noreply.github.com> --- .../praisonaiagents/rag/context.py | 27 ++++++++++++------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/src/praisonai-agents/praisonaiagents/rag/context.py b/src/praisonai-agents/praisonaiagents/rag/context.py index 36281d261..b6820dbf5 100644 --- a/src/praisonai-agents/praisonaiagents/rag/context.py +++ b/src/praisonai-agents/praisonaiagents/rag/context.py @@ -43,6 +43,21 @@ def _extract_value(item: ResultItem, key: str, default: Any = None) -> Any: return getattr(item, key, default) +def _extract_metadata_value( + item: ResultItem, + metadata: Dict[str, Any], + key: str, + default: Any = "", +) -> Any: + """ + Extract metadata value with fallback to top-level item attribute. + """ + value = metadata.get(key) + if value is None: + return _extract_value(item, key, default) + return value + + def _chunk_hash(text: str, source: Optional[str] = None) -> str: """ Generate stable hash for chunk deduplication. @@ -90,9 +105,7 @@ def deduplicate_chunks( if not isinstance(metadata, dict): metadata = {} - source = metadata.get("source") - if source is None: - source = _extract_value(result, "source", "") + source = _extract_metadata_value(result, metadata, "source", "") chunk_id = _chunk_hash(text, source) @@ -189,12 +202,8 @@ def build_context( if not isinstance(metadata, dict): metadata = {} - source = metadata.get("source") - if source is None: - source = _extract_value(result, "source", "") - filename = metadata.get("filename") - if filename is None: - filename = _extract_value(result, "filename", "") + source = _extract_metadata_value(result, metadata, "source", "") + filename = _extract_metadata_value(result, metadata, "filename", "") source_label = filename or source or f"Source {i + 1}" chunk_text = f"[{source_label}]\n{text}" else: From 984808d3bd050f5e667694a1c70fdcfa49bd48ca Mon Sep 17 00:00:00 2001 From: "praisonai-triage-agent[bot]" <272766704+praisonai-triage-agent[bot]@users.noreply.github.com> Date: Mon, 25 May 2026 08:26:22 +0000 Subject: [PATCH 5/5] test: add comprehensive SearchResultItem test coverage for RAG context module - Add tests for _extract_value and _extract_metadata_value helpers - Add tests for SearchResultItem deduplication functionality - Add tests for mixed dict/SearchResultItem format handling - Add tests for source/filename fallback behavior from top-level attributes - Verify metadata preference over top-level attributes works correctly Ensures all reviewer feedback scenarios are properly tested. Co-authored-by: Mervin Praison --- .../tests/unit/rag/test_context.py | 185 ++++++++++++++++++ 1 file changed, 185 insertions(+) diff --git a/src/praisonai-agents/tests/unit/rag/test_context.py b/src/praisonai-agents/tests/unit/rag/test_context.py index e84649f1e..a95313ad3 100644 --- a/src/praisonai-agents/tests/unit/rag/test_context.py +++ b/src/praisonai-agents/tests/unit/rag/test_context.py @@ -223,3 +223,188 @@ def test_default_builder_custom_separator(self): context = builder.build(results) assert "\n\n" in context + + +class TestExtractValue: + """Tests for the _extract_value helper function.""" + + def test_extract_value_from_dict(self): + """Test extracting value from dictionary.""" + from praisonaiagents.rag.context import _extract_value + + item = {"text": "test content", "metadata": {"source": "test.pdf"}} + + assert _extract_value(item, "text") == "test content" + assert _extract_value(item, "metadata")["source"] == "test.pdf" + assert _extract_value(item, "nonexistent", "default") == "default" + + def test_extract_value_from_searchresultitem(self): + """Test extracting value from SearchResultItem object.""" + from praisonaiagents.rag.context import _extract_value + from praisonaiagents.knowledge.models import SearchResultItem + + item = SearchResultItem( + text="test content", + source="test.pdf", + filename="test.pdf", + metadata={"extra": "data"} + ) + + assert _extract_value(item, "text") == "test content" + assert _extract_value(item, "source") == "test.pdf" + assert _extract_value(item, "filename") == "test.pdf" + assert _extract_value(item, "metadata")["extra"] == "data" + assert _extract_value(item, "nonexistent", "default") == "default" + + +class TestExtractMetadataValue: + """Tests for the _extract_metadata_value helper function.""" + + def test_extract_metadata_value_from_metadata(self): + """Test extracting value from metadata dict.""" + from praisonaiagents.rag.context import _extract_metadata_value + from praisonaiagents.knowledge.models import SearchResultItem + + item = SearchResultItem(text="content", source="fallback.pdf") + metadata = {"source": "metadata.pdf"} + + # Should prefer metadata over top-level + result = _extract_metadata_value(item, metadata, "source", "") + assert result == "metadata.pdf" + + def test_extract_metadata_value_fallback_to_toplevel(self): + """Test fallback to top-level when metadata doesn't have key.""" + from praisonaiagents.rag.context import _extract_metadata_value + from praisonaiagents.knowledge.models import SearchResultItem + + item = SearchResultItem(text="content", source="toplevel.pdf") + metadata = {} # Empty metadata + + # Should fall back to top-level source + result = _extract_metadata_value(item, metadata, "source", "") + assert result == "toplevel.pdf" + + def test_extract_metadata_value_with_default(self): + """Test fallback to default when neither metadata nor top-level have key.""" + from praisonaiagents.rag.context import _extract_metadata_value + from praisonaiagents.knowledge.models import SearchResultItem + + item = SearchResultItem(text="content") # No source + metadata = {} # No source in metadata + + # Should use default + result = _extract_metadata_value(item, metadata, "source", "default.pdf") + assert result == "default.pdf" + + +class TestSearchResultItemSupport: + """Tests for SearchResultItem object support in context functions.""" + + def test_deduplicate_chunks_with_searchresultitem(self): + """Test deduplication with SearchResultItem objects.""" + from praisonaiagents.rag.context import deduplicate_chunks + from praisonaiagents.knowledge.models import SearchResultItem + + results = [ + SearchResultItem(text="Same content", source="a.pdf"), + SearchResultItem(text="Same content", source="a.pdf"), # Duplicate + SearchResultItem(text="Different content", source="b.pdf"), + ] + + deduped = deduplicate_chunks(results) + assert len(deduped) == 2 # One duplicate should be removed + + def test_deduplicate_chunks_mixed_formats(self): + """Test deduplication with mixed dict and SearchResultItem formats.""" + from praisonaiagents.rag.context import deduplicate_chunks + from praisonaiagents.knowledge.models import SearchResultItem + + results = [ + {"text": "Content A", "metadata": {"source": "a.pdf"}}, + SearchResultItem(text="Content A", source="a.pdf"), # Should be deduped + SearchResultItem(text="Content B", source="b.pdf"), + ] + + deduped = deduplicate_chunks(results) + assert len(deduped) == 2 # One duplicate should be removed + + def test_deduplicate_with_top_level_source(self): + """Test deduplication considers top-level source from SearchResultItem.""" + from praisonaiagents.rag.context import deduplicate_chunks + from praisonaiagents.knowledge.models import SearchResultItem + + results = [ + SearchResultItem(text="Same text", source="source1.pdf"), + SearchResultItem(text="Same text", source="source2.pdf"), + ] + + deduped = deduplicate_chunks(results) + assert len(deduped) == 2 # Different sources should not be deduped + + def test_build_context_with_searchresultitem(self): + """Test building context with SearchResultItem objects.""" + from praisonaiagents.rag.context import build_context + from praisonaiagents.knowledge.models import SearchResultItem + + results = [ + SearchResultItem( + text="First content", + source="source1.pdf", + filename="file1.pdf" + ), + SearchResultItem( + text="Second content", + source="source2.pdf", + filename="file2.pdf" + ), + ] + + context, used = build_context(results) + + assert "First content" in context + assert "Second content" in context + assert "file1.pdf" in context # Should use filename + assert "file2.pdf" in context + assert len(used) == 2 + + def test_build_context_source_fallback(self): + """Test that build_context falls back to top-level source when metadata is empty.""" + from praisonaiagents.rag.context import build_context + from praisonaiagents.knowledge.models import SearchResultItem + + # SearchResultItem with source but empty metadata + results = [ + SearchResultItem( + text="Content with source", + source="fallback.pdf", + metadata={} # Empty metadata, should fall back to top-level source + ) + ] + + context, used = build_context(results, include_source=True) + + assert "Content with source" in context + assert "fallback.pdf" in context # Should use top-level source + assert len(used) == 1 + + def test_build_context_mixed_formats_with_sources(self): + """Test building context with mixed dict/SearchResultItem formats.""" + from praisonaiagents.rag.context import build_context + from praisonaiagents.knowledge.models import SearchResultItem + + results = [ + {"text": "Dict content", "metadata": {"filename": "dict.pdf"}}, + SearchResultItem( + text="Object content", + filename="object.pdf", + metadata={} + ), + ] + + context, used = build_context(results, include_source=True) + + assert "Dict content" in context + assert "Object content" in context + assert "dict.pdf" in context + assert "object.pdf" in context # Should use top-level filename + assert len(used) == 2