Added process-wide temp directory utility and updated lancedb to support it.

travis-bauer · travis-bauer · commit cd07eb9c2e1c · 2025-11-17T05:11:58.000-07:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -11,6 +11,12 @@
   (assign, concat, longestStr, isIn, isNotIn, isTrue, isFalse, Hash), core.py (AbstractFieldSegment),
   search modules (whoosh.py, lancedb.py), llm modules (chat.py, embedding.py), data modules
   (mongo.py, text/chunking_units.py), and pipelines (basic_rag.py, vector_databases.py).
+- Added `get_process_temp_dir()` utility to allow creation of temporary directories that are removed
+  when the process exits (using `atexit`), but are consistent within a process.
+- Added support for `tmp://name` URI scheme in LanceDB path parameters. This enables process-scoped
+  temporary databases that are automatically cleaned up on exit. Temporary databases with the same name
+  share state within a process, making them ideal for testing or ephemeral workflows. Works alongside
+  existing `memory://` (in-memory) and file path options. Implemented via `get_process_temp_dir()`.
 
 ## 0.10.0
 ### New Features
diff --git a/src/talkpipe/app/chatterlang_workbench.py b/src/talkpipe/app/chatterlang_workbench.py
@@ -94,7 +94,7 @@ def emit(self, record):
         {
             "name": "RAG Pipeline with Vector Database",
             "description": "Build a complete RAG system with document indexing and querying",
-            "code": '# This example demonstrates a complete RAG (Retrieval-Augmented Generation) workflow.\n# It indexes documents into a vector database and then queries them with an LLM.\n\n# Sample knowledge base documents (in a real scenario, these would be from files or a database)\nCONST docs = "TalkPipe is a Python toolkit for building AI workflows. It provides a Unix-like pipeline syntax for chaining data transformations and LLM operations.|TalkPipe supports multiple LLM providers including OpenAI, Ollama, and Anthropic. You can switch between providers easily using configuration.|With TalkPipe, you can build RAG systems, multi-agent debates, and document processing pipelines. It uses Python generators for memory-efficient streaming.";\n\n# Step 1: Index documents into a vector database\nINPUT FROM echo[data=docs, delimiter="|"] \n    | toDict[field_list="_:text"] \n    | makeVectorDatabase[\n        path="./demo_knowledge_base",\n        embedding_model="nomic-embed-text",\n        embedding_source="ollama",\n        embedding_field="text"\n      ] \n    | print;\n\n# Step 2: Query the knowledge base with RAG\nINPUT FROM echo[data="What are the key benefits of using TalkPipe?"] \n    | toDict[field_list="_:text"] \n    | ragToText[\n        path="./demo_knowledge_base",\n        embedding_model="nomic-embed-text",\n        embedding_source="ollama",\n        completion_model="llama3.2",\n        completion_source="ollama",\n        content_field="text",\n        prompt_directive="Answer the question based on the background information provided.",\n        limit=3\n      ] \n    | print'
+            "code": '# This example demonstrates a complete RAG (Retrieval-Augmented Generation) workflow.\n# It indexes documents into a vector database and then queries them with an LLM.\n\n# Sample knowledge base documents (in a real scenario, these would be from files or a database)\nCONST docs = "TalkPipe is a Python toolkit for building AI workflows. It provides a Unix-like pipeline syntax for chaining data transformations and LLM operations.|TalkPipe supports multiple LLM providers including OpenAI, Ollama, and Anthropic. You can switch between providers easily using configuration.|With TalkPipe, you can build RAG systems, multi-agent debates, and document processing pipelines. It uses Python generators for memory-efficient streaming.";\n\n# Step 1: Index documents into a vector database\nINPUT FROM echo[data=docs, delimiter="|"] \n    | toDict[field_list="_:text"] \n    | makeVectorDatabase[\n        path="tmp://demo_knowledge_base",\n        embedding_model="nomic-embed-text",\n        embedding_source="ollama",\n        embedding_field="text"\n      ] \n    | print;\n\n# Step 2: Query the knowledge base with RAG\nINPUT FROM echo[data="What are the key benefits of using TalkPipe?"] \n    | toDict[field_list="_:text"] \n    | ragToText[\n        path="tmp://demo_knowledge_base",\n        embedding_model="nomic-embed-text",\n        embedding_source="ollama",\n        completion_model="llama3.2",\n        completion_source="ollama",\n        content_field="text",\n        prompt_directive="Answer the question based on the background information provided.",\n        limit=3\n      ] \n    | print'
         }
     ]
 }
@@ -732,7 +732,7 @@ def get_ui():
         <div id="cursorPosition">Line: 0, Column: 0</div>
       </div>
       <div class="button-group">
-        <button id="compileButton">Compile Script</button>
+        <button id="compileButton">Compile and Run Script</button>
         <button id="toggle-examples">Toggle Examples</button>
         <button id="log-button">Toggle Logs</button>
         <span id="compileLoadingIndicator" class="loading hidden">Compiling script...</span>
diff --git a/src/talkpipe/pipelines/basic_rag.py b/src/talkpipe/pipelines/basic_rag.py
@@ -61,14 +61,19 @@ def transform(self, input_iter):
 
 class AbstractRAGPipeline(AbstractSegment):
     """ Convenience segment that runs a RAG pipeline from search to prompt creation to LLM completion.
+
+    Path supports multiple URI schemes:
+    - File path: "./my_db" or "/path/to/db" - Persistent storage
+    - Memory: "memory://" - Ephemeral in-memory database (faster, no disk I/O)
+    - Temp: "tmp://name" - Process-scoped temporary database (shared by name, auto-cleanup on exit)
     """
-    
+
     def __init__(self,
                  embedding_model: Annotated[str, "Embedding model to use"],
                  embedding_source: Annotated[str, "Source of text to embed"],
                  completion_model: Annotated[str, "LLM model to use for completion"],
                  completion_source: Annotated[str, "Source of prompt for completion"],
-                 path: Annotated[str, "Path to the LanceDB database"],
+                 path: Annotated[str, "Path to LanceDB database. Supports file paths, 'memory://' for in-memory, or 'tmp://name' for process-scoped temp (auto-cleanup)"],
                  content_field: Annotated[Any, "Field to evaluate relevance on"],
                  prompt_directive: Annotated[str, "Directive to guide the evaluation"] = "Respond to the provided content based on the background information. If the background does not contain relevant information, respond with 'No relevant information found.'",
                  set_as: Annotated[str, "The field to set/append the result as."] = None,
@@ -113,14 +118,19 @@ def transform(self, input_iter):
 @register_segment("ragToText")
 class RAGToText(AbstractRAGPipeline):
     """ RAG pipeline that outputs text completions from LLM.
+
+    Path supports multiple URI schemes:
+    - File path: "./my_db" or "/path/to/db" - Persistent storage
+    - Memory: "memory://" - Ephemeral in-memory database (faster, no disk I/O)
+    - Temp: "tmp://name" - Process-scoped temporary database (shared by name, auto-cleanup on exit)
     """
 
     def __init__(self,
                  embedding_model: Annotated[str, "Embedding model to use"],
                  embedding_source: Annotated[str, "Source of text to embed"],
                  completion_model: Annotated[str, "LLM model to use for completion"],
                  completion_source: Annotated[str, "Source of prompt for completion"],
-                 path: Annotated[str, "Path to the LanceDB database"],
+                 path: Annotated[str, "Path to LanceDB database. Supports file paths, 'memory://' for in-memory, or 'tmp://name' for process-scoped temp (auto-cleanup)"],
                  content_field: Annotated[Any, "Field to evaluate relevance on"],
                  prompt_directive: Annotated[str, "Directive to guide the evaluation"] = "Respond to the provided content based on the background information. If the background does not contain relevant information, respond with 'No relevant information found.'",
                  set_as: Annotated[str, "The field to set/append the result as."] = None,
@@ -146,14 +156,19 @@ def make_completion_segment(self) -> AbstractSegment:
 @register_segment("ragToBinaryAnswer")
 class RAGToBinaryAnswer(AbstractRAGPipeline):
     """ RAG pipeline that outputs binary answers from LLM.
+
+    Path supports multiple URI schemes:
+    - File path: "./my_db" or "/path/to/db" - Persistent storage
+    - Memory: "memory://" - Ephemeral in-memory database (faster, no disk I/O)
+    - Temp: "tmp://name" - Process-scoped temporary database (shared by name, auto-cleanup on exit)
     """
 
     def __init__(self,
                  embedding_model: Annotated[str, "Embedding model to use"],
                  embedding_source: Annotated[str, "Source of text to embed"],
                  completion_model: Annotated[str, "LLM model to use for completion"],
                  completion_source: Annotated[str, "Source of prompt for completion"],
-                 path: Annotated[str, "Path to the LanceDB database"],
+                 path: Annotated[str, "Path to LanceDB database. Supports file paths, 'memory://' for in-memory, or 'tmp://name' for process-scoped temp (auto-cleanup)"],
                  content_field: Annotated[Any, "Field to evaluate relevance on"],
                  prompt_directive: Annotated[str, "Directive to guide the evaluation"] = "Answer the provided question as YES or NO. If the background does not contain relevant information, respond with 'NO'.",
                  set_as: Annotated[str, "The field to set/append the result as."] = None,
@@ -180,14 +195,19 @@ def make_completion_segment(self) -> AbstractSegment:
 @register_segment("ragToScore")
 class RAGToScore(AbstractRAGPipeline):
     """ RAG pipeline that outputs scores from LLM.
+
+    Path supports multiple URI schemes:
+    - File path: "./my_db" or "/path/to/db" - Persistent storage
+    - Memory: "memory://" - Ephemeral in-memory database (faster, no disk I/O)
+    - Temp: "tmp://name" - Process-scoped temporary database (shared by name, auto-cleanup on exit)
     """
 
     def __init__(self,
                  embedding_model: Annotated[str, "Embedding model to use"],
                  embedding_source: Annotated[str, "Source of text to embed"],
                  completion_model: Annotated[str, "LLM model to use for completion"],
                  completion_source: Annotated[str, "Source of prompt for completion"],
-                 path: Annotated[str, "Path to the LanceDB database"],
+                 path: Annotated[str, "Path to LanceDB database. Supports file paths, 'memory://' for in-memory, or 'tmp://name' for process-scoped temp (auto-cleanup)"],
                  content_field: Annotated[Any, "Field to evaluate relevance on"],
                  prompt_directive: Annotated[str, "Directive to guide the evaluation"] = "Answer the provided question on a scale of 1 to 10. If the background does not contain relevant information, respond with a score of 1.",
                  set_as: Annotated[str, "The field to set/append the result as."] = None,
diff --git a/src/talkpipe/pipelines/vector_databases.py b/src/talkpipe/pipelines/vector_databases.py
@@ -10,13 +10,18 @@ class MakeVectorDatabaseSegment(AbstractSegment):
 
     This segment expects dictionary inputs representing documents.
     It embeds the specified field and stores the documents with their embeddings in LanceDB.
+
+    Path supports multiple URI schemes:
+    - File path: "./my_db" or "/path/to/db" - Persistent storage
+    - Memory: "memory://" - Ephemeral in-memory database (faster, no disk I/O)
+    - Temp: "tmp://name" - Process-scoped temporary database (shared by name, auto-cleanup on exit)
     """
 
     def __init__(self,
                  embedding_field: Annotated[str, "Field to use for embeddings"],
                  embedding_model: Annotated[str, "Embedding model to use"],
                  embedding_source: Annotated[str, "Source of text to embed"],
-                 path: Annotated[str, "Path to the LanceDB database"],
+                 path: Annotated[str, "Path to LanceDB database. Supports file paths, 'memory://', or 'tmp://name'"],
                  table_name: Annotated[str, "Name of the table in the database"] = "docs",
                  doc_id_field: Annotated[Optional[str], "Field containing document ID"] = None,
                  overwrite: Annotated[bool, "If true, overwrite existing table"] = False,
@@ -52,12 +57,17 @@ class SearchVectorDatabaseSegment(AbstractSegment):
       search results are yielded (set_as must be None).
     - If query_field is specified: Expects dictionary inputs, embeds the specified field,
       and search results can be yielded directly (set_as=None) or attached to the input item.
+
+    Path supports multiple URI schemes:
+    - File path: "./my_db" or "/path/to/db" - Persistent storage
+    - Memory: "memory://" - Ephemeral in-memory database (faster, no disk I/O)
+    - Temp: "tmp://name" - Process-scoped temporary database (shared by name, auto-cleanup on exit)
     """
 
     def __init__(self,
                  embedding_model: Annotated[str, "Embedding model to use"],
                  embedding_source: Annotated[str, "Source of text to embed"],
-                 path: Annotated[str, "Path to the LanceDB database"],
+                 path: Annotated[str, "Path to LanceDB database. Supports file paths, 'memory://' for in-memory, or 'tmp://name' for process-scoped temp (auto-cleanup)"],
                  table_name: Annotated[str, "Name of the table in the database"] = "docs",
                  query_field: Annotated[Optional[str], "Field containing the query text to embed. If None, expects string inputs."] = None,
                  limit: Annotated[int, "Number of search results to return"] = 10,
diff --git a/src/talkpipe/search/lancedb.py b/src/talkpipe/search/lancedb.py
@@ -7,14 +7,56 @@
 from talkpipe.chatterlang import register_segment
 from talkpipe import segment
 from talkpipe.util.data_manipulation import extract_property, VectorLike, Document, DocID, toDict, assign_property
+from talkpipe.util.os import get_process_temp_dir
 from .abstract import DocumentStore, VectorAddable, VectorSearchable, SearchResult
 
 logger = logging.getLogger(__name__)
 
+
+def parse_db_path(path: str) -> str:
+    """
+    Parse database path, handling special URI schemes.
+
+    Only the "tmp://" prefix is interpreted here; every other value
+    (regular paths, "memory://", the empty string) is returned unchanged.
+
+    Supported schemes:
+    - Regular paths: "/path/to/db" -> "/path/to/db"
+    - Memory DBs: "memory://" or "" -> passes through to LanceDB
+    - Temp DBs: "tmp://name" -> process-wide temp directory path
+
+    Args:
+        path: Database path or URI
+
+    Returns:
+        Resolved path suitable for lancedb.connect(). For "tmp://name" this is
+        whatever get_process_temp_dir(name) returns.
+
+    Examples:
+        >>> parse_db_path("/data/mydb")
+        '/data/mydb'
+
+        >>> parse_db_path("memory://")
+        'memory://'
+
+        The tmp:// result below is illustrative only; the real location is
+        chosen by get_process_temp_dir().
+
+        >>> parse_db_path("tmp://my_cache")  # doctest: +SKIP
+        '/tmp/talkpipe_tmp/my_cache'
+
+    Raises:
+        ValueError: If tmp:// URI has no name
+    """
+    if path.startswith("tmp://"):
+        # Extract name from URI
+        name = path[6:]  # Remove "tmp://" prefix (6 == len("tmp://"))
+        if not name:
+            # A bare "tmp://" carries no name to pass on, so reject it explicitly
+            raise ValueError("tmp:// URI requires a name (e.g., tmp://my_db)")
+
+        # Get process-wide temp directory
+        return get_process_temp_dir(name)
+
+    # Pass through other URIs/paths
+    return path
+
 @register_segment("searchLanceDB", "searchLancDB")
 @segment()
 def search_lancedb(items: Annotated[object, "Items with the query vectors"],
-                   path: Annotated[str, "Path to the LanceDB database"],
+                   path: Annotated[str, "Path to the LanceDB database. Supports file paths, 'memory://' for in-memory, or 'tmp://name' for process-scoped temp (auto-cleanup)"],
                    table_name: Annotated[str, "Table name in the LanceDB database"],
                    all_results_at_once: Annotated[bool, "If true, return all results at once"]=False,
                    field: Annotated[str, "Field with the vector"]=None,
@@ -25,6 +67,11 @@ def search_lancedb(items: Annotated[object, "Items with the query vectors"],
                 ):
     """Search for similar vectors in LanceDB and return SearchResult objects.
 
+    The path parameter supports multiple URI schemes:
+    - File path: "./my_db" or "/path/to/db" - Persistent storage
+    - Memory: "memory://" - Ephemeral in-memory database (faster, no disk I/O)
+    - Temp: "tmp://name" - Process-scoped temporary database (shared by name, auto-cleanup on exit)
+
     Yields:
         SearchResult objects or lists of SearchResult objects.
     """
@@ -59,7 +106,7 @@ def search_lancedb(items: Annotated[object, "Items with the query vectors"],
 @register_segment("addToLanceDB", "addToLancDB")
 @segment()
 def add_to_lancedb(items: Annotated[object, "Items with the vectors and documents"],
-                   path: Annotated[str, "Path to the LanceDB database"],
+                   path: Annotated[str, "Path to the LanceDB database. Supports file paths, 'memory://' for in-memory, or 'tmp://name' for process-scoped temp (auto-cleanup)"],
                    table_name: Annotated[str, "Table name in the LanceDB database"],
                    vector_field: Annotated[str, "The field containing the vector data"] = "vector",
                    doc_id_field: Annotated[Optional[str], "Field containing document ID"] = None,
@@ -69,6 +116,11 @@ def add_to_lancedb(items: Annotated[object, "Items with the vectors and document
                    ):
     """Add vectors and documents to LanceDB using LanceDBDocumentStore.
 
+    The path parameter supports multiple URI schemes:
+    - File path: "./my_db" or "/path/to/db" - Persistent storage
+    - Memory: "memory://" - Ephemeral in-memory database (faster, no disk I/O)
+    - Temp: "tmp://name" - Process-scoped temporary database (shared by name, auto-cleanup on exit)
+
     Returns:
         The original items with the document IDs added.
     """
@@ -136,11 +188,15 @@ def __init__(self, path: str, table_name: str = "documents", vector_dim: Optiona
         Initialize the LanceDB document store.
 
         Args:
-            path: Path to the LanceDB database
+            path: Path to the LanceDB database. Supports:
+                  - Regular paths: "/path/to/db"
+                  - Memory DBs: "memory://"
+                  - Temp DBs: "tmp://name" (process-wide, auto-cleanup)
             table_name: Name of the table to store documents in
             vector_dim: Expected dimension of vectors (optional, inferred from first vector)
         """
-        self.path = path
+        self.original_path = path  # Keep original for reference
+        self.path = parse_db_path(path)  # Resolve tmp:// and other URIs
         self.table_name = table_name
         self.vector_dim = vector_dim
         self._db = None
diff --git a/src/talkpipe/util/os.py b/src/talkpipe/util/os.py
diff --git a/tests/talkpipe/util/test_os.py b/tests/talkpipe/util/test_os.py

Original file line number	Diff line number	Diff line change
`@@ -94,7 +94,7 @@ def emit(self, record):`
`94`	`94`	`{`
`95`	`95`	`"name": "RAG Pipeline with Vector Database",`
`96`	`96`	`"description": "Build a complete RAG system with document indexing and querying",`
`97`		- "code": '# This example demonstrates a complete RAG (Retrieval-Augmented Generation) workflow.\n# It indexes documents into a vector database and then queries them with an LLM.\n\n# Sample knowledge base documents (in a real scenario, these would be from files or a database)\nCONST docs = "TalkPipe is a Python toolkit for building AI workflows. It provides a Unix-like pipeline syntax for chaining data transformations and LLM operations.\|TalkPipe supports multiple LLM providers including OpenAI, Ollama, and Anthropic. You can switch between providers easily using configuration.\|With TalkPipe, you can build RAG systems, multi-agent debates, and document processing pipelines. It uses Python generators for memory-efficient streaming.";\n\n# Step 1: Index documents into a vector database\nINPUT FROM echo[data=docs, delimiter="\|"] \n \| toDict[field_list="_:text"] \n \| makeVectorDatabase[\n path="./demo_knowledge_base",\n embedding_model="nomic-embed-text",\n embedding_source="ollama",\n embedding_field="text"\n ] \n \| print;\n\n# Step 2: Query the knowledge base with RAG\nINPUT FROM echo[data="What are the key benefits of using TalkPipe?"] \n \| toDict[field_list="_:text"] \n \| ragToText[\n path="./demo_knowledge_base",\n embedding_model="nomic-embed-text",\n embedding_source="ollama",\n completion_model="llama3.2",\n completion_source="ollama",\n content_field="text",\n prompt_directive="Answer the question based on the background information provided.",\n limit=3\n ] \n \| print'
	`97`	+ "code": '# This example demonstrates a complete RAG (Retrieval-Augmented Generation) workflow.\n# It indexes documents into a vector database and then queries them with an LLM.\n\n# Sample knowledge base documents (in a real scenario, these would be from files or a database)\nCONST docs = "TalkPipe is a Python toolkit for building AI workflows. It provides a Unix-like pipeline syntax for chaining data transformations and LLM operations.\|TalkPipe supports multiple LLM providers including OpenAI, Ollama, and Anthropic. You can switch between providers easily using configuration.\|With TalkPipe, you can build RAG systems, multi-agent debates, and document processing pipelines. It uses Python generators for memory-efficient streaming.";\n\n# Step 1: Index documents into a vector database\nINPUT FROM echo[data=docs, delimiter="\|"] \n \| toDict[field_list="_:text"] \n \| makeVectorDatabase[\n path="tmp://demo_knowledge_base",\n embedding_model="nomic-embed-text",\n embedding_source="ollama",\n embedding_field="text"\n ] \n \| print;\n\n# Step 2: Query the knowledge base with RAG\nINPUT FROM echo[data="What are the key benefits of using TalkPipe?"] \n \| toDict[field_list="_:text"] \n \| ragToText[\n path="tmp://demo_knowledge_base",\n embedding_model="nomic-embed-text",\n embedding_source="ollama",\n completion_model="llama3.2",\n completion_source="ollama",\n content_field="text",\n prompt_directive="Answer the question based on the background information provided.",\n limit=3\n ] \n \| print'
`98`	`98`	`}`
`99`	`99`	`]`
`100`	`100`	`}`
`@@ -732,7 +732,7 @@ def get_ui():`
`732`	`732`	`<div id="cursorPosition">Line: 0, Column: 0</div>`
`733`	`733`	`</div>`
`734`	`734`	`<div class="button-group">`
`735`		`- <button id="compileButton">Compile Script</button>`
	`735`	`+ <button id="compileButton">Compile and Run Script</button>`
`736`	`736`	`<button id="toggle-examples">Toggle Examples</button>`
`737`	`737`	`<button id="log-button">Toggle Logs</button>`
`738`	`738`	`<span id="compileLoadingIndicator" class="loading hidden">Compiling script...</span>`