Enhance Document Management (#528)

* Patch/fix minor doc mgmt issues rebased (#519) * up * up * patching subtle issues with doc management * cleanups * fix args and kwargs dev (#520) * add qs (#521) * Patch/enforce dictionary exists (#522) * add qs * force existence of dictionary * Add health endpoint * Update README.md (#524) * Fix gitignore * Remove coverage.xml * Patch/finalize v0 kg agent (#527) * add qs * force existence of dictionary * up * Have health response be 'response': 'ok' * Feature/revive multi search web (#531) * add qs * force existence of dictionary * up * revive multi search web queries * roll back config * update lock * Patch/max depth error (#532) * add qs * force existence of dictionary * up * revive multi search web queries * roll back config * Patch/max depth error (#533) * add qs * force existence of dictionary * up * revive multi search web queries * roll back config * fix * Feature/patch r2r (#534) * add qs * force existence of dictionary * up * revive multi search web queries * roll back config * fix * finish changes * move dev dependencies --------- Co-authored-by: NolanTrem <[email protected]>
SciPhi-AI · Jun 25, 2024 · 4b3a389 · 4b3a389
1 parent 3a9f47d
commit 4b3a389
Show file tree

Hide file tree

Showing 33 changed files with 344 additions and 361 deletions.
diff --git a/.coverage b/.coverage
diff --git a/.github/.codecov.yml b/.github/.codecov.yml
@@ -0,0 +1,5 @@
+coverage:
+  status:
+    project:
+      default:
+        threshold: 100%
diff --git a/.gitignore b/.gitignore
@@ -10,6 +10,8 @@ dump/*
 .next
 node_modules
 
+coverage.xml
+.coverage
 
 **/*.sqlite*
 **/*.sqlite3*

diff --git a/README.md b/README.md
@@ -169,15 +169,21 @@ Interact with R2R using our [open-source React+Next.js dashboard](https://github
 - [Discord](https://discord.gg/p6KqD2kjtB): Chat live with maintainers and community members
 - [Github Issues](https://github.com/SciPhi-AI/R2R/issues): Report bugs and request features
 
-Explore our [R2R Docs](https://r2r-docs.sciphi.ai/) for tutorials and cookbooks on various R2R features and integrations, including:
-- [Client-Server](https://r2r-docs.sciphi.ai/cookbooks/client-server)
-- [Multiple LLMs](https://r2r-docs.sciphi.ai/cookbooks/multiple-llms)
-- [Knowledge Graph RAG](https://r2r-docs.sciphi.ai/cookbooks/knowledge-graph)
-- [Multimodal RAG](https://r2r-docs.sciphi.ai/cookbooks/multimodal)
-- [Hybrid Search](https://r2r-docs.sciphi.ai/cookbooks/hybrid-search)
-- [Local RAG](https://r2r-docs.sciphi.ai/cookbooks/local-rag)
-- [Reranking](https://r2r-docs.sciphi.ai/cookbooks/rerank-search)
-- [Dashboard](https://r2r-docs.sciphi.ai/cookbooks/dashboard)
+**Explore our [R2R Docs](https://r2r-docs.sciphi.ai/) for tutorials and cookbooks on various R2R features and integrations, including:**
+
+### RAG Cookbooks
+- [Multiple LLMs](https://r2r-docs.sciphi.ai/cookbooks/multiple-llms): A simple cookbook showing how R2R supports multiple LLMs.
+- [Hybrid Search](https://r2r-docs.sciphi.ai/cookbooks/hybrid-search): A brief introduction to running hybrid search with R2R.
+- [Multimodal RAG](https://r2r-docs.sciphi.ai/cookbooks/multimodal): A cookbook on multimodal RAG with R2R.
+- [Knowledge Graphs](https://r2r-docs.sciphi.ai/cookbooks/knowledge-graph): A walkthrough of automatic knowledge graph generation with R2R.
+- [Local RAG](https://r2r-docs.sciphi.ai/cookbooks/local-rag): A quick cookbook demonstration of how to run R2R with local LLMs.
+- [Reranking](https://r2r-docs.sciphi.ai/cookbooks/rerank-search): A short guide on how to apply reranking to R2R results.
+
+### App Features
+- [Client-Server](https://r2r-docs.sciphi.ai/cookbooks/client-server): An extension of the basic `R2R Quickstart` with client-server interactions.
+- [Document Management](https://r2r-docs.sciphi.ai/cookbooks/document-management): A cookbook showing how to manage your documents with R2R.
+- [Analytics & Observability](https://r2r-docs.sciphi.ai/cookbooks/observablity): A cookbook showing R2R's end-to-end logging and analytics.
+- [Dashboard](https://r2r-docs.sciphi.ai/cookbooks/dashboard): A how-to guide on connecting with the R2R Dashboard.
 
 # Contributing
 

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 
 [tool.poetry]
 name = "r2r"
-version = "0.2.37"
+version = "0.2.38"
 description = "SciPhi R2R"
 authors = ["Owen Colegrove <[email protected]>"]
 license = "MIT"
@@ -58,24 +58,22 @@ litellm = "^1.35.18"
 openai = "^1.11.1"
 
 # integrations
-ionic-api-sdk = {version = "0.9.3", optional = true}
-exa-py = {version = "^1.0.9", optional = true}
 dateutils = "^0.6.12"
 fsspec = "^2024.6.0"
 posthog = "^3.5.0"
-neo4j = {version = "^5.21.0", optional = true}
 sqlalchemy = "^2.0.30"
 ollama = "^0.2.1"
+neo4j = {version = "^5.21.0", optional = true}
 
 [tool.poetry.extras]
+all = ["tiktoken", "sentence-transformers", "neo4j", "moviepy", "opencv-python"]
 kg = ["neo4j"]
-all = ["tiktoken", "sentence-transformers"]
-exa = ["exa-py"]
-ionic = ["ionic-api-sdk"]
 local-embedding = ["sentence-transformers"]
 ingest-movies = ["moviepy", "opencv-python"]
+
 [tool.poetry.group.dev.dependencies]
 black = "^24.3.0"
+codecov = "^2.1.13"
 flake8 = "6.1.0"
 isort = "5.12.0"
 mypy = "^1.5.1"
@@ -84,6 +82,7 @@ pytest = "^8.2.0"
 pytest-asyncio = "^0.23.6"
 pytest-dependency = "^0.6.0"
 pytest-mock = "^3.14.0"
+pytest-cov = "^5.0.0"
 
 [tool.black]
 line-length = 79
@@ -95,3 +94,9 @@ exclude = 'playground/.*|deprecated/.*|dump/.*|docs/source|vecs/*'
 [[tool.mypy.overrides]]
 module = "yaml"
 ignore_missing_imports = true
+
+[tool.pytest.ini_options]
+addopts = "--cov=r2r --cov-report=term-missing --cov-report=xml"
+testpaths = [
+    "tests",
+]
diff --git a/r2r/core/abstractions/document.py b/r2r/core/abstractions/document.py
@@ -52,6 +52,17 @@ def __init__(self, *args, **kwargs):
                 # If it's not base64, encode it to bytes
                 kwargs["data"] = data.encode("utf-8")
 
+        # Generate UUID based on the hash of the data
+        if "id" not in kwargs:
+            if isinstance(kwargs["data"], bytes):
+                data_hash = uuid.uuid5(
+                    uuid.NAMESPACE_DNS, kwargs["data"].decode("utf-8")
+                )
+            else:
+                data_hash = uuid.uuid5(uuid.NAMESPACE_DNS, kwargs["data"])
+
+            kwargs["id"] = data_hash  # Set the id based on the data hash
+
         super().__init__(*args, **kwargs)
 
     class Config:
@@ -71,6 +82,7 @@ class DocumentInfo(BaseModel):
     metadata: dict
 
     user_id: Optional[uuid.UUID] = None
+    title: Optional[str] = None
     created_at: Optional[datetime] = None
     updated_at: Optional[datetime] = None
 
@@ -81,6 +93,7 @@ def convert_to_db_entry(self):
         metadata["user_id"] = (
             str(metadata["user_id"]) if "user_id" in metadata else None
         )
+        metadata["title"] = metadata.get("title", "N/A")
         return {
             "document_id": str(self.document_id),
             "title": metadata.get("title", "N/A"),

diff --git a/r2r/core/abstractions/search.py b/r2r/core/abstractions/search.py
@@ -1,7 +1,7 @@
 """Abstractions for search functionality."""
 
 import uuid
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Tuple
 
 from pydantic import BaseModel, Field
 
@@ -43,7 +43,8 @@ class KGSearchRequest(BaseModel):
     query: str
 
 
-KGSearchResult = List[List[Dict[str, Any]]]
+# Each entry is a (query, results) tuple: a query string and its list of result dicts.
+KGSearchResult = List[Tuple[str, List[Dict[str, Any]]]]
 
 
 class AggregateSearchResult(BaseModel):
@@ -60,16 +61,18 @@ def __repr__(self) -> str:
 
     def dict(self) -> dict:
         return {
-            "vector_search_results": [
-                result.dict() for result in self.vector_search_results
-            ],
-            "kg_search_results": self.kg_search_results,
+            "vector_search_results": (
+                [result.dict() for result in self.vector_search_results]
+                if self.vector_search_results
+                else []
+            ),
+            "kg_search_results": self.kg_search_results or [],
         }
 
 
 class VectorSearchSettings(BaseModel):
     use_vector_search: bool = True
-    search_filters: Optional[dict[str, Any]] = Field(default_factory=dict)
+    search_filters: dict[str, Any] = Field(default_factory=dict)
     search_limit: int = 10
     do_hybrid_search: bool = False
 

diff --git a/r2r/core/providers/kg_provider.py b/r2r/core/providers/kg_provider.py
@@ -114,7 +114,7 @@ def update_extraction_prompt(
 
     # TODO - Type this method.
     @abstractmethod
-    def update_agent_prompt(
+    def update_kg_agent_prompt(
         self,
         prompt_provider: Any,
         entity_types: list[Any],

diff --git a/r2r/examples/configs/pgvector.json b/r2r/examples/configs/pgvector.json
diff --git a/r2r/examples/quickstart.py b/r2r/examples/quickstart.py
@@ -37,7 +37,7 @@ class R2RQuickstart:
 
     def __init__(
         self,
-        config_name: Optional[str] = "default",
+        config_name: Optional[str] = None,
         config_path: Optional[str] = None,
         file_list: Optional[list[str]] = None,
         file_tuples: Optional[list[tuple]] = None,
@@ -46,6 +46,8 @@ def __init__(
     ):
         if config_path and config_name:
             raise ValueError("Cannot specify both config and config_name")
+        if not config_name and not config_path:
+            config_name = "default"
 
         if config_path:
             config = R2RConfig.from_json(config_path)
@@ -124,10 +126,6 @@ def ingest_documents(self, file_paths: Optional[list[str]] = None):
                 documents_dicts, monitor=True
             )
         else:
-            print(
-                "calling ingest_documents iwth documents = ",
-                [document.metadata for document in documents],
-            )
             response = self.r2r_app.ingest_documents(documents)
 
         t1 = time.time()
@@ -303,7 +301,7 @@ def search(
                 query,
                 VectorSearchSettings(
                     use_vector_search=use_vector_search,
-                    search_filters=search_filters,
+                    search_filters=search_filters or {},
                     search_limit=search_limit,
                     do_hybrid_search=do_hybrid_search,
                 ),
@@ -357,7 +355,7 @@ def rag(
             response = self.client.rag(
                 query=query,
                 use_vector_search=use_vector_search,
-                search_filters=search_filters,
+                search_filters=search_filters or {},
                 search_limit=search_limit,
                 do_hybrid_search=do_hybrid_search,
                 use_kg_search=use_kg_search,
@@ -380,7 +378,7 @@ def rag(
                 query,
                 vector_search_settings=VectorSearchSettings(
                     use_vector_search=use_vector_search,
-                    search_filters=search_filters,
+                    search_filters=search_filters or {},
                     search_limit=search_limit,
                     do_hybrid_search=do_hybrid_search,
                 ),

diff --git a/r2r/examples/scripts/advanced_kg_cookbook.py b/r2r/examples/scripts/advanced_kg_cookbook.py
@@ -7,11 +7,11 @@
 from r2r import (
     Document,
     EntityType,
-    KGAgentSearchPipe,
-    Pipeline,
+    KGSearchSettings,
     R2RAppBuilder,
     Relation,
-    run_pipeline,
+    VectorSearchSettings,
+    generate_id_from_label,
 )
 from r2r.core.abstractions.llm import GenerationConfig
 
@@ -202,9 +202,10 @@ def main(max_entries=50, delete=False):
             r2r_app.ingest_documents(
                 [
                     Document(
+                        id=generate_id_from_label(company),
                         type="txt",
                         data=company_data,
-                        metadata={},
+                        metadata={"title": company},
                     )
                 ]
             )
@@ -214,34 +215,26 @@ def main(max_entries=50, delete=False):
 
     print_all_relationships(kg)
 
-    kg_agent_search_pipe = KGAgentSearchPipe(
-        r2r_app.providers.kg, r2r_app.providers.llm, r2r_app.providers.prompt
-    )
-
-    # Define the pipeline
-    kg_pipe = Pipeline()
-    kg_pipe.add_pipe(kg_agent_search_pipe)
-
-    kg.update_agent_prompt(prompt_provider, entity_types, relations)
+    # the default prompt is `kg_agent` in `prompts/local/defaults.jsonl`
+    # `update_kg_agent_prompt` updates this with `kg_agent_with_spec`,
+    # after updating the prompt with the specified entity types and relations
+    kg.update_kg_agent_prompt(prompt_provider, entity_types, relations)
 
-    import asyncio
-
-    def restart_event_loop():
-        loop = asyncio.get_event_loop()
-        if loop.is_closed():
-            loop = asyncio.new_event_loop()
-            asyncio.set_event_loop(loop)
-        return loop
+    result = r2r_app.search(
+        query="Find up to 10 founders that worked at Google",
+        kg_search_settings=KGSearchSettings(use_kg_search=True),
+        vector_search_settings=VectorSearchSettings(use_vector_search=False),
+    )
 
-    restart_event_loop()
+    print("Search Result:\n", result["kg_search_results"])
 
-    agent_result = run_pipeline(
-        kg_pipe,
-        "Find up to 10 founders that worked at Google",
+    result = r2r_app.rag(
+        query="Find up to 10 founders that worked at Google",
+        kg_search_settings=KGSearchSettings(use_kg_search=True),
+        vector_search_settings=VectorSearchSettings(use_vector_search=False),
         rag_generation_config=GenerationConfig(model="gpt-4o"),
     )
-
-    print(agent_result)
+    print("RAG Result:\n", result)
 
 
 if __name__ == "__main__":

diff --git a/r2r/examples/scripts/run_hyde.py b/r2r/examples/scripts/run_hyde.py
@@ -1,7 +1,10 @@
+import fire
+
 from r2r import R2RAppBuilder, R2RConfig, R2RPipeFactoryWithMultiSearch
 from r2r.core.abstractions.llm import GenerationConfig
 
-if __name__ == "__main__":
+
+def main(task_prompt_name="hyde", query="Who was aristotle?"):
     # Load the configuration file
     config = R2RConfig.from_json()
 
@@ -10,15 +13,19 @@
         .with_pipe_factory(R2RPipeFactoryWithMultiSearch)
         .build(
             # Add optional override arguments which propagate to the pipe factory
-            task_prompt_name="hyde",
+            task_prompt_name=task_prompt_name,
         )
     )
 
     # Run the RAG pipeline through the R2R application
     result = r2r_app.rag(
-        "Who was aristotle?",
+        query,
         query_transform_generation_config=GenerationConfig(model="gpt-4o"),
         rag_generation_config=GenerationConfig(model="gpt-3.5-turbo"),
     )
 
     print(f"Final Result:\n\n{result}")
+
+
+if __name__ == "__main__":
+    fire.Fire(main)
diff --git a/r2r/examples/scripts/run_web_multi_search.py → r2r/examples/scripts/run_web_multi_rag.py b/r2r/examples/scripts/run_web_multi_search.py → r2r/examples/scripts/run_web_multi_rag.py
@@ -15,6 +15,7 @@ def run_rag_pipeline(query="Who was Aristotle?"):
 
     # Define a new synthetic query generation template
     synthetic_query_generation_template = {
+        "name": "synthetic_query_generation_template",
         "template": """
             ### Instruction:
             Given the following query, write a double newline separated list of up to {num_outputs} advanced queries meant to help answer the original query.

diff --git a/r2r/examples/scripts/run_web_search.py → r2r/examples/scripts/run_web_rag.py b/r2r/examples/scripts/run_web_search.py → r2r/examples/scripts/run_web_rag.py
@@ -10,7 +10,7 @@ def run_rag_pipeline(query="Who was Aristotle?"):
         serper_client=SerperClient()  # TODO - Develop a `WebSearchProvider` for configurability
     )
 
-    r2r_app = R2RAppBuilder().with_search_pipe(web_search_pipe).build()
+    r2r_app = R2RAppBuilder().with_vector_search_pipe(web_search_pipe).build()
 
     # Run the RAG pipeline through the R2R application
     result = r2r_app.rag(

diff --git a/r2r/examples/servers/configurable_pipeline.py b/r2r/examples/servers/configurable_pipeline.py
@@ -57,7 +57,9 @@ def r2r_app(
         web_search_pipe = WebSearchPipe(
             serper_client=SerperClient()  # TODO - Develop a `WebSearchProvider` for configurability
         )
-        builder = R2RAppBuilder(config).with_search_pipe(web_search_pipe)
+        builder = R2RAppBuilder(config).with_vector_search_pipe(
+            web_search_pipe
+        )
     # elif pipeline_type == PipelineType.HYDE:
     #     builder =  (
     #         R2RAppBuilder(config)

diff --git a/r2r/integrations/__init__.py b/r2r/integrations/__init__.py
@@ -1,5 +1,3 @@
-from .exa import ExaClient
-from .ionic import IonicClient
 from .serper import SerperClient
 
-__all__ = ["ExaClient", "IonicClient", "SerperClient"]
+__all__ = ["SerperClient"]