Skip to content

Commit 19197fd

Browse files
committed
Chunker .add API
1 parent d2b2d7f commit 19197fd

File tree

2 files changed

+129
-1
lines changed

2 files changed

+129
-1
lines changed

chromadb/api/models/Collection.py

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from typing import TYPE_CHECKING, Optional, Tuple, Any, Union
1+
from typing import TYPE_CHECKING, List, Optional, Tuple, Any, Union, cast
22

33
import numpy as np
44
from pydantic import BaseModel, PrivateAttr
@@ -8,6 +8,7 @@
88

99
from chromadb.api.types import (
1010
URI,
11+
Chunker,
1112
CollectionMetadata,
1213
DataLoader,
1314
Embedding,
@@ -114,6 +115,7 @@ def add(
114115
documents: Optional[OneOrMany[Document]] = None,
115116
images: Optional[OneOrMany[Image]] = None,
116117
uris: Optional[OneOrMany[URI]] = None,
118+
chunker: Optional[Chunker[Embeddable]] = None,
117119
) -> None:
118120
"""Add embeddings to the data store.
119121
Args:
@@ -123,6 +125,7 @@ def add(
123125
documents: The documents to associate with the embeddings. Optional.
124126
images: The images to associate with the embeddings. Optional.
125127
uris: The uris of the images to associate with the embeddings. Optional.
128+
chunker: A Chunker used to split the input into chunks before embedding. Optional.
126129
127130
Returns:
128131
None
@@ -150,9 +153,30 @@ def add(
150153
# We need to compute the embeddings if they're not provided
151154
if embeddings is None:
152155
# At this point, we know that one of documents or images is provided from the validation above
156+
157+
def chunk_and_assign_ids(
158+
input: Embeddable, ids: IDs, chunker: Chunker[Embeddable]
159+
) -> Tuple[List[Embeddable], List[IDs]]:
160+
chunks = chunker(input)
161+
chunk_ids = []
162+
for chunk, id in zip(chunks, ids):
163+
chunk_ids.append([f"{id}_{i}" for i in range(len(chunk))])
164+
return chunks, chunk_ids
165+
153166
if documents is not None:
167+
if chunker is not None:
168+
chunks, chunk_ids = chunk_and_assign_ids(documents, ids, chunker)
169+
# Flatten chunk_ids and chunks, and assign them as the new values
170+
ids = [id for chunk_id in chunk_ids for id in chunk_id]
171+
documents = [
172+
cast(Document, doc) for chunk in chunks for doc in chunk
173+
]
174+
154175
embeddings = self._embed(input=documents)
155176
elif images is not None:
177+
if chunker is not None:
178+
# Image chunking is not supported yet
179+
raise NotImplementedError("Image chunking is not supported yet.")
156180
embeddings = self._embed(input=images)
157181
else:
158182
if uris is None:

chunker_example.ipynb

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 1,
6+
"metadata": {},
7+
"outputs": [
8+
{
9+
"data": {
10+
"text/plain": [
11+
"[['This is a test of the chunker. It should chunk this text into a list of',\n",
12+
" 'sentences. This is the first sentence. This is the second sentence.',\n",
13+
" 'Folks, this example has so many beautiful sentences. I love it.',\n",
14+
" 'You guys. The test. Ten thousand blistering blue barnacles.',\n",
15+
" \"It's great.\\n\\n\\nIt's the best example ever. I'm so happy to be working on this.\"]]"
16+
]
17+
},
18+
"execution_count": 1,
19+
"metadata": {},
20+
"output_type": "execute_result"
21+
}
22+
],
23+
"source": [
24+
"from chromadb.utils.chunkers.default_chunker import DefaultTextChunker\n",
25+
"\n",
26+
"chunker = DefaultTextChunker()\n",
27+
"\n",
28+
"test_text = \"\"\"\n",
29+
"This is a test of the chunker. It should chunk this text into a list of\n",
30+
"sentences. This is the first sentence. This is the second sentence.\n",
31+
"\n",
32+
"\n",
33+
"Folks, this example has so many beautiful sentences. I love it.\n",
34+
"You guys. The test. Ten thousand blistering blue barnacles.\n",
35+
"\n",
36+
"It's great.\n",
37+
"\n",
38+
"\n",
39+
"It's the best example ever. I'm so happy to be working on this.\n",
40+
"\"\"\"\n",
41+
"\n",
42+
"chunker([test_text], max_chunk_size=100)"
43+
]
44+
},
45+
{
46+
"cell_type": "code",
47+
"execution_count": 2,
48+
"metadata": {},
49+
"outputs": [
50+
{
51+
"data": {
52+
"text/plain": [
53+
"{'ids': ['test_id_0', 'test_id_1', 'test_id_2', 'test_id_3', 'test_id_4'],\n",
54+
" 'embeddings': None,\n",
55+
" 'metadatas': [None, None, None, None, None],\n",
56+
" 'documents': ['This is a test of the chunker. It should chunk this text into a list of',\n",
57+
" 'sentences. This is the first sentence. This is the second sentence.',\n",
58+
" 'Folks, this example has so many beautiful sentences. I love it.',\n",
59+
" 'You guys. The test. Ten thousand blistering blue barnacles.',\n",
60+
" \"It's great.\\n\\n\\nIt's the best example ever. I'm so happy to be working on this.\"],\n",
61+
" 'uris': None,\n",
62+
" 'data': None,\n",
63+
" 'included': ['metadatas', 'documents']}"
64+
]
65+
},
66+
"execution_count": 2,
67+
"metadata": {},
68+
"output_type": "execute_result"
69+
}
70+
],
71+
"source": [
72+
"import chromadb\n",
73+
"\n",
74+
"client = chromadb.Client()\n",
75+
"collection = client.get_or_create_collection('test_collection')\n",
76+
"\n",
77+
"collection.add(ids=['test_id'], documents=[test_text], chunker=DefaultTextChunker(max_chunk_size=100))\n",
78+
"\n",
79+
"collection.get()"
80+
]
81+
}
82+
],
83+
"metadata": {
84+
"kernelspec": {
85+
"display_name": "chroma",
86+
"language": "python",
87+
"name": "python3"
88+
},
89+
"language_info": {
90+
"codemirror_mode": {
91+
"name": "ipython",
92+
"version": 3
93+
},
94+
"file_extension": ".py",
95+
"mimetype": "text/x-python",
96+
"name": "python",
97+
"nbconvert_exporter": "python",
98+
"pygments_lexer": "ipython3",
99+
"version": "3.12.2"
100+
}
101+
},
102+
"nbformat": 4,
103+
"nbformat_minor": 2
104+
}

0 commit comments

Comments
 (0)