Skip to content

[DRAFT] Chunker .add API #2261

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 1 commit into
base: 05-29-anton_chunker
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 26 additions & 1 deletion chromadb/api/models/Collection.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
from typing import TYPE_CHECKING, Optional, Tuple, Any, Union, cast
from typing import TYPE_CHECKING, List, Optional, Tuple, Any, Union, cast

import numpy as np
from uuid import UUID
import chromadb.utils.embedding_functions as ef
from chromadb.api.types import (
URI,
Chunker,
CollectionMetadata,
DataLoader,
Embedding,
Expand Down Expand Up @@ -153,6 +155,7 @@ def add(
documents: Optional[OneOrMany[Document]] = None,
images: Optional[OneOrMany[Image]] = None,
uris: Optional[OneOrMany[URI]] = None,
chunker: Optional[Chunker[Embeddable]] = None,
) -> None:
"""Add embeddings to the data store.
Args:
Expand All @@ -162,6 +165,7 @@ def add(
documents: The documents to associate with the embeddings. Optional.
images: The images to associate with the embeddings. Optional.
uris: The uris of the images to associate with the embeddings. Optional.
chunker: Convert the input into chunks before embedding. Optional.

Returns:
None
Expand Down Expand Up @@ -189,9 +193,30 @@ def add(
# We need to compute the embeddings if they're not provided
if embeddings is None:
# At this point, we know that one of documents or images are provided from the validation above

def chunk_and_assign_ids(
input: Embeddable, ids: IDs, chunker: Chunker[Embeddable]
) -> Tuple[List[Embeddable], List[IDs]]:
chunks = chunker(input)
chunk_ids = []
for chunk, id in zip(chunks, ids):
chunk_ids.append([f"{id}_{i}" for i in range(len(chunk))])
return chunks, chunk_ids

if documents is not None:
if chunker is not None:
chunks, chunk_ids = chunk_and_assign_ids(documents, ids, chunker)
# Flatten chunk_ids and chunks, and assign them as the new values
ids = [id for chunk_id in chunk_ids for id in chunk_id]
documents = [
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Basically we expand the list of IDs by chunk ID. We should probably pass these out to the user (we currently do not pass IDs out) so they can keep track of the new IDs. I can do that here, or in a separate PR.

cast(Document, doc) for chunk in chunks for doc in chunk
]

embeddings = self._embed(input=documents)
elif images is not None:
if chunker is not None:
# Image chunking is not supported yet
raise NotImplementedError("Image chunking is not supported yet.")
embeddings = self._embed(input=images)
else:
if uris is None:
Expand Down
104 changes: 104 additions & 0 deletions chunker_example.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[['This is a test of the chunker. It should chunk this text into a list of',\n",
" 'sentences. This is the first sentence. This is the second sentence.',\n",
" 'Folks, this example has so many beautiful sentences. I love it.',\n",
" 'You guys. The test. Ten thousand blistering blue barnacles.',\n",
" \"It's great.\\n\\n\\nIt's the best example ever. I'm so happy to be working on this.\"]]"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from chromadb.utils.chunkers.default_chunker import DefaultTextChunker\n",
"\n",
"chunker = DefaultTextChunker()\n",
"\n",
"test_text = \"\"\"\n",
"This is a test of the chunker. It should chunk this text into a list of\n",
"sentences. This is the first sentence. This is the second sentence.\n",
"\n",
"\n",
"Folks, this example has so many beautiful sentences. I love it.\n",
"You guys. The test. Ten thousand blistering blue barnacles.\n",
"\n",
"It's great.\n",
"\n",
"\n",
"It's the best example ever. I'm so happy to be working on this.\n",
"\"\"\"\n",
"\n",
"chunker([test_text], max_chunk_size=100)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'ids': ['test_id_0', 'test_id_1', 'test_id_2', 'test_id_3', 'test_id_4'],\n",
" 'embeddings': None,\n",
" 'metadatas': [None, None, None, None, None],\n",
" 'documents': ['This is a test of the chunker. It should chunk this text into a list of',\n",
" 'sentences. This is the first sentence. This is the second sentence.',\n",
" 'Folks, this example has so many beautiful sentences. I love it.',\n",
" 'You guys. The test. Ten thousand blistering blue barnacles.',\n",
" \"It's great.\\n\\n\\nIt's the best example ever. I'm so happy to be working on this.\"],\n",
" 'uris': None,\n",
" 'data': None,\n",
" 'included': ['metadatas', 'documents']}"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import chromadb\n",
"\n",
"client = chromadb.Client()\n",
"collection = client.get_or_create_collection('test_collection')\n",
"\n",
"collection.add(ids=['test_id'], documents=[test_text], chunker=DefaultTextChunker(max_chunk_size=100))\n",
"\n",
"collection.get()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "chroma",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Loading