Skip to content

Commit 19197fd

Browse files
committed
Chunker .add API
1 parent d2b2d7f commit 19197fd

File tree

2 files changed

+129
-1
lines changed

2 files changed

+129
-1
lines changed

chromadb/api/models/Collection.py

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from typing import TYPE_CHECKING, Optional, Tuple, Any, Union
1+
from typing import TYPE_CHECKING, List, Optional, Tuple, Any, Union, cast
22

33
import numpy as np
44
from pydantic import BaseModel, PrivateAttr
@@ -8,6 +8,7 @@
88

99
from chromadb.api.types import (
1010
URI,
11+
Chunker,
1112
CollectionMetadata,
1213
DataLoader,
1314
Embedding,
@@ -114,6 +115,7 @@ def add(
114115
documents: Optional[OneOrMany[Document]] = None,
115116
images: Optional[OneOrMany[Image]] = None,
116117
uris: Optional[OneOrMany[URI]] = None,
118+
chunker: Optional[Chunker[Embeddable]] = None,
117119
) -> None:
118120
"""Add embeddings to the data store.
119121
Args:
@@ -123,6 +125,7 @@ def add(
123125
documents: The documents to associate with the embeddings. Optional.
124126
images: The images to associate with the embeddings. Optional.
125127
uris: The uris of the images to associate with the embeddings. Optional.
128+
chunker: A Chunker used to split the input into chunks before embedding. Optional.
126129
127130
Returns:
128131
None
@@ -150,9 +153,30 @@ def add(
150153
# We need to compute the embeddings if they're not provided
151154
if embeddings is None:
152155
# At this point, we know that one of documents or images is provided from the validation above
156+
157+
def chunk_and_assign_ids(
158+
input: Embeddable, ids: IDs, chunker: Chunker[Embeddable]
159+
) -> Tuple[List[Embeddable], List[IDs]]:
160+
chunks = chunker(input)
161+
chunk_ids = []
162+
for chunk, id in zip(chunks, ids):
163+
chunk_ids.append([f"{id}_{i}" for i in range(len(chunk))])
164+
return chunks, chunk_ids
165+
153166
if documents is not None:
167+
if chunker is not None:
168+
chunks, chunk_ids = chunk_and_assign_ids(documents, ids, chunker)
169+
# Flatten chunk_ids and chunks, and assign them as the new values
170+
ids = [id for chunk_id in chunk_ids for id in chunk_id]
171+
documents = [
172+
cast(Document, doc) for chunk in chunks for doc in chunk
173+
]
174+
154175
embeddings = self._embed(input=documents)
155176
elif images is not None:
177+
if chunker is not None:
178+
# Image chunking is not supported yet
179+
raise NotImplementedError("Image chunking is not supported yet.")
156180
embeddings = self._embed(input=images)
157181
else:
158182
if uris is None:

chunker_example.ipynb

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 1,
6+
"metadata": {},
7+
"outputs": [
8+
{
9+
"data": {
10+
"text/plain": [
11+
"[['This is a test of the chunker. It should chunk this text into a list of',\n",
12+
" 'sentences. This is the first sentence. This is the second sentence.',\n",
13+
" 'Folks, this example has so many beautiful sentences. I love it.',\n",
14+
" 'You guys. The test. Ten thousand blistering blue barnacles.',\n",
15+
" \"It's great.\\n\\n\\nIt's the best example ever. I'm so happy to be working on this.\"]]"
16+
]
17+
},
18+
"execution_count": 1,
19+
"metadata": {},
20+
"output_type": "execute_result"
21+
}
22+
],
23+
"source": [
24+
"from chromadb.utils.chunkers.default_chunker import DefaultTextChunker\n",
25+
"\n",
26+
"chunker = DefaultTextChunker()\n",
27+
"\n",
28+
"test_text = \"\"\"\n",
29+
"This is a test of the chunker. It should chunk this text into a list of\n",
30+
"sentences. This is the first sentence. This is the second sentence.\n",
31+
"\n",
32+
"\n",
33+
"Folks, this example has so many beautiful sentences. I love it.\n",
34+
"You guys. The test. Ten thousand blistering blue barnacles.\n",
35+
"\n",
36+
"It's great.\n",
37+
"\n",
38+
"\n",
39+
"It's the best example ever. I'm so happy to be working on this.\n",
40+
"\"\"\"\n",
41+
"\n",
42+
"chunker([test_text], max_chunk_size=100)"
43+
]
44+
},
45+
{
46+
"cell_type": "code",
47+
"execution_count": 2,
48+
"metadata": {},
49+
"outputs": [
50+
{
51+
"data": {
52+
"text/plain": [
53+
"{'ids': ['test_id_0', 'test_id_1', 'test_id_2', 'test_id_3', 'test_id_4'],\n",
54+
" 'embeddings': None,\n",
55+
" 'metadatas': [None, None, None, None, None],\n",
56+
" 'documents': ['This is a test of the chunker. It should chunk this text into a list of',\n",
57+
" 'sentences. This is the first sentence. This is the second sentence.',\n",
58+
" 'Folks, this example has so many beautiful sentences. I love it.',\n",
59+
" 'You guys. The test. Ten thousand blistering blue barnacles.',\n",
60+
" \"It's great.\\n\\n\\nIt's the best example ever. I'm so happy to be working on this.\"],\n",
61+
" 'uris': None,\n",
62+
" 'data': None,\n",
63+
" 'included': ['metadatas', 'documents']}"
64+
]
65+
},
66+
"execution_count": 2,
67+
"metadata": {},
68+
"output_type": "execute_result"
69+
}
70+
],
71+
"source": [
72+
"import chromadb\n",
73+
"\n",
74+
"client = chromadb.Client()\n",
75+
"collection = client.get_or_create_collection('test_collection')\n",
76+
"\n",
77+
"collection.add(ids=['test_id'], documents=[test_text], chunker=DefaultTextChunker(max_chunk_size=100))\n",
78+
"\n",
79+
"collection.get()"
80+
]
81+
}
82+
],
83+
"metadata": {
84+
"kernelspec": {
85+
"display_name": "chroma",
86+
"language": "python",
87+
"name": "python3"
88+
},
89+
"language_info": {
90+
"codemirror_mode": {
91+
"name": "ipython",
92+
"version": 3
93+
},
94+
"file_extension": ".py",
95+
"mimetype": "text/x-python",
96+
"name": "python",
97+
"nbconvert_exporter": "python",
98+
"pygments_lexer": "ipython3",
99+
"version": "3.12.2"
100+
}
101+
},
102+
"nbformat": 4,
103+
"nbformat_minor": 2
104+
}

0 commit comments

Comments
 (0)