Skip to content

Commit 88b08a7

Browse files
rfctr: chroma destination migrated to V2 (#3214)
Moving Chroma destination to the V2 version of connectors.
1 parent 8610bd3 commit 88b08a7

File tree

8 files changed

+495
-0
lines changed

8 files changed

+495
-0
lines changed

Diff for: unstructured/ingest/v2/cli/cmds/__init__.py

+2
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import click
44

5+
from .chroma import chroma_dest_cmd
56
from .elasticsearch import elasticsearch_dest_cmd, elasticsearch_src_cmd
67
from .fsspec.azure import azure_dest_cmd, azure_src_cmd
78
from .fsspec.box import box_dest_cmd, box_src_cmd
@@ -35,6 +36,7 @@
3536
dest_cmds = [
3637
azure_dest_cmd,
3738
box_dest_cmd,
39+
chroma_dest_cmd,
3840
dropbox_dest_cmd,
3941
elasticsearch_dest_cmd,
4042
gcs_dest_cmd,

Diff for: unstructured/ingest/v2/cli/cmds/chroma.py

+108
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
from dataclasses import dataclass
2+
3+
import click
4+
5+
from unstructured.ingest.cli.interfaces import Dict
6+
from unstructured.ingest.v2.cli.base import DestCmd
7+
from unstructured.ingest.v2.cli.interfaces import CliConfig
8+
from unstructured.ingest.v2.processes.connectors.chroma import CONNECTOR_TYPE
9+
10+
11+
@dataclass
12+
class ChromaCliConnectionConfig(CliConfig):
13+
@staticmethod
14+
def get_cli_options() -> list[click.Option]:
15+
options = [
16+
click.Option(
17+
["--path"],
18+
required=False,
19+
type=str,
20+
help="Location where Chroma is persisted," "if not connecting via http.",
21+
),
22+
click.Option(
23+
["--settings"],
24+
required=False,
25+
type=Dict(),
26+
help="A dictionary of settings to communicate with the chroma server."
27+
'example: \'{"persist_directory":"./chroma-persist"}\' ',
28+
),
29+
click.Option(
30+
["--tenant"],
31+
required=False,
32+
default="default_tenant",
33+
type=str,
34+
help="The tenant to use for this client. Chroma defaults to 'default_tenant'.",
35+
),
36+
click.Option(
37+
["--database"],
38+
required=False,
39+
default="default_database",
40+
type=str,
41+
help="The database to use for this client."
42+
"Chroma defaults to 'default_database'.",
43+
),
44+
click.Option(
45+
["--host"],
46+
required=False,
47+
type=str,
48+
help="The hostname of the Chroma server.",
49+
),
50+
click.Option(
51+
["--port"],
52+
required=False,
53+
type=int,
54+
help="The port of the Chroma server.",
55+
),
56+
click.Option(
57+
["--ssl"],
58+
required=False,
59+
default=False,
60+
is_flag=True,
61+
type=bool,
62+
help="Whether to use SSL to connect to the Chroma server.",
63+
),
64+
click.Option(
65+
["--headers"],
66+
required=False,
67+
type=Dict(),
68+
help="A dictionary of headers to send to the Chroma server."
69+
'example: \'{"Authorization":"Basic()"}\' ',
70+
),
71+
click.Option(
72+
["--collection-name"],
73+
required=True,
74+
type=str,
75+
help="The name of the Chroma collection to write into.",
76+
),
77+
]
78+
return options
79+
80+
81+
@dataclass
82+
class ChromaCliUploaderConfig(CliConfig):
83+
@staticmethod
84+
def get_cli_options() -> list[click.Option]:
85+
options = [
86+
click.Option(
87+
["--batch-size"],
88+
default=100,
89+
type=int,
90+
help="Number of records per batch",
91+
)
92+
]
93+
return options
94+
95+
96+
@dataclass
97+
class ChromaCliUploadStagerConfig(CliConfig):
98+
@staticmethod
99+
def get_cli_options() -> list[click.Option]:
100+
return []
101+
102+
103+
chroma_dest_cmd = DestCmd(
104+
cmd_name=CONNECTOR_TYPE,
105+
connection_config=ChromaCliConnectionConfig,
106+
uploader_config=ChromaCliUploaderConfig,
107+
upload_stager_config=ChromaCliUploadStagerConfig,
108+
)

Diff for: unstructured/ingest/v2/examples/example_chroma.py

+53
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
import random
2+
from pathlib import Path
3+
4+
from unstructured.ingest.v2.interfaces import ProcessorConfig
5+
from unstructured.ingest.v2.logger import logger
6+
from unstructured.ingest.v2.pipeline.pipeline import Pipeline
7+
from unstructured.ingest.v2.processes.chunker import ChunkerConfig
8+
from unstructured.ingest.v2.processes.connectors.chroma import (
9+
ChromaAccessConfig,
10+
ChromaConnectionConfig,
11+
ChromaUploaderConfig,
12+
ChromaUploadStagerConfig,
13+
)
14+
from unstructured.ingest.v2.processes.connectors.local import (
15+
LocalConnectionConfig,
16+
LocalDownloaderConfig,
17+
LocalIndexerConfig,
18+
)
19+
from unstructured.ingest.v2.processes.embedder import EmbedderConfig
20+
from unstructured.ingest.v2.processes.partitioner import PartitionerConfig
21+
22+
base_path = Path(__file__).parent.parent.parent.parent.parent
23+
docs_path = base_path / "example-docs"
24+
work_dir = base_path / "tmp_ingest"
25+
output_path = work_dir / "output"
26+
download_path = work_dir / "download"
27+
28+
if __name__ == "__main__":
29+
logger.info(f"Writing all content in: {work_dir.resolve()}")
30+
Pipeline.from_configs(
31+
context=ProcessorConfig(work_dir=str(work_dir.resolve())),
32+
indexer_config=LocalIndexerConfig(input_path=str(docs_path.resolve()) + "/multisimple/"),
33+
downloader_config=LocalDownloaderConfig(download_dir=download_path),
34+
source_connection_config=LocalConnectionConfig(),
35+
partitioner_config=PartitionerConfig(strategy="fast"),
36+
chunker_config=ChunkerConfig(
37+
chunking_strategy="by_title",
38+
chunk_include_orig_elements=False,
39+
chunk_max_characters=1500,
40+
chunk_multipage_sections=True,
41+
),
42+
embedder_config=EmbedderConfig(embedding_provider="langchain-huggingface"),
43+
destination_connection_config=ChromaConnectionConfig(
44+
access_config=ChromaAccessConfig(settings=None, headers=None),
45+
host="localhost",
46+
port=8047,
47+
collection_name=f"test-collection-{random.randint(1000,9999)}",
48+
tenant="default_tenant",
49+
database="default_database",
50+
),
51+
stager_config=ChromaUploadStagerConfig(),
52+
uploader_config=ChromaUploaderConfig(batch_size=10),
53+
).run()

Diff for: unstructured/ingest/v2/examples/example_local.py

+35
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
from pathlib import Path
2+
3+
from unstructured.ingest.v2.interfaces import ProcessorConfig
4+
from unstructured.ingest.v2.logger import logger
5+
from unstructured.ingest.v2.pipeline.pipeline import Pipeline
6+
from unstructured.ingest.v2.processes.chunker import ChunkerConfig
7+
from unstructured.ingest.v2.processes.connectors.local import (
8+
LocalConnectionConfig,
9+
LocalDownloaderConfig,
10+
LocalIndexerConfig,
11+
LocalUploaderConfig,
12+
)
13+
from unstructured.ingest.v2.processes.embedder import EmbedderConfig
14+
from unstructured.ingest.v2.processes.partitioner import PartitionerConfig
15+
16+
base_path = Path(__file__).parent.parent.parent.parent.parent
17+
docs_path = base_path / "example-docs"
18+
work_dir = base_path / "tmp_ingest"
19+
output_path = work_dir / "output"
20+
download_path = work_dir / "download"
21+
22+
if __name__ == "__main__":
23+
logger.info(f"Writing all content in: {work_dir.resolve()}")
24+
Pipeline.from_configs(
25+
context=ProcessorConfig(work_dir=str(work_dir.resolve())),
26+
indexer_config=LocalIndexerConfig(
27+
input_path=str(docs_path.resolve()) + "/book-war-and-peace-1p.txt"
28+
),
29+
downloader_config=LocalDownloaderConfig(download_dir=download_path),
30+
source_connection_config=LocalConnectionConfig(),
31+
partitioner_config=PartitionerConfig(strategy="fast"),
32+
chunker_config=ChunkerConfig(chunking_strategy="by_title"),
33+
embedder_config=EmbedderConfig(embedding_provider="langchain-huggingface"),
34+
uploader_config=LocalUploaderConfig(output_dir=str(output_path.resolve())),
35+
).run()

Diff for: unstructured/ingest/v2/examples/example_s3.py

+35
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
from pathlib import Path
2+
3+
from unstructured.ingest.v2.interfaces import ProcessorConfig
4+
from unstructured.ingest.v2.logger import logger
5+
from unstructured.ingest.v2.pipeline.pipeline import Pipeline
6+
from unstructured.ingest.v2.processes.chunker import ChunkerConfig
7+
from unstructured.ingest.v2.processes.connectors.fsspec.s3 import (
8+
S3ConnectionConfig,
9+
S3DownloaderConfig,
10+
S3IndexerConfig,
11+
)
12+
from unstructured.ingest.v2.processes.connectors.local import (
13+
LocalUploaderConfig,
14+
)
15+
from unstructured.ingest.v2.processes.embedder import EmbedderConfig
16+
from unstructured.ingest.v2.processes.partitioner import PartitionerConfig
17+
18+
base_path = Path(__file__).parent.parent.parent.parent.parent
19+
docs_path = base_path / "example-docs"
20+
work_dir = base_path / "tmp_ingest"
21+
output_path = work_dir / "output"
22+
download_path = work_dir / "download"
23+
24+
if __name__ == "__main__":
25+
logger.info(f"Writing all content in: {work_dir.resolve()}")
26+
Pipeline.from_configs(
27+
context=ProcessorConfig(work_dir=str(work_dir.resolve())),
28+
indexer_config=S3IndexerConfig(remote_url="s3://utic-dev-tech-fixtures/small-pdf-set/"),
29+
downloader_config=S3DownloaderConfig(download_dir=download_path),
30+
source_connection_config=S3ConnectionConfig(anonymous=True),
31+
partitioner_config=PartitionerConfig(strategy="fast"),
32+
chunker_config=ChunkerConfig(chunking_strategy="by_title"),
33+
embedder_config=EmbedderConfig(embedding_provider="langchain-huggingface"),
34+
uploader_config=LocalUploaderConfig(output_dir=str(output_path.resolve())),
35+
).run()

Diff for: unstructured/ingest/v2/examples/example_weaviate.py

+44
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
from pathlib import Path
2+
3+
from unstructured.ingest.v2.interfaces import ProcessorConfig
4+
from unstructured.ingest.v2.logger import logger
5+
from unstructured.ingest.v2.pipeline.pipeline import Pipeline
6+
from unstructured.ingest.v2.processes.chunker import ChunkerConfig
7+
from unstructured.ingest.v2.processes.connectors.local import (
8+
LocalConnectionConfig,
9+
LocalDownloaderConfig,
10+
LocalIndexerConfig,
11+
)
12+
from unstructured.ingest.v2.processes.connectors.weaviate import (
13+
WeaviateConnectionConfig,
14+
WeaviateUploaderConfig,
15+
WeaviateUploadStagerConfig,
16+
)
17+
from unstructured.ingest.v2.processes.embedder import EmbedderConfig
18+
from unstructured.ingest.v2.processes.partitioner import PartitionerConfig
19+
20+
base_path = Path(__file__).parent.parent.parent.parent.parent
21+
docs_path = base_path / "example-docs"
22+
work_dir = base_path / "tmp_ingest"
23+
output_path = work_dir / "output"
24+
download_path = work_dir / "download"
25+
26+
if __name__ == "__main__":
27+
logger.info(f"Writing all content in: {work_dir.resolve()}")
28+
Pipeline.from_configs(
29+
context=ProcessorConfig(work_dir=str(work_dir.resolve())),
30+
indexer_config=LocalIndexerConfig(input_path=str(docs_path.resolve()) + "/multisimple/"),
31+
downloader_config=LocalDownloaderConfig(download_dir=download_path),
32+
source_connection_config=LocalConnectionConfig(),
33+
partitioner_config=PartitionerConfig(strategy="fast"),
34+
chunker_config=ChunkerConfig(chunking_strategy="by_title"),
35+
embedder_config=EmbedderConfig(embedding_provider="langchain-huggingface"),
36+
destination_connection_config=WeaviateConnectionConfig(
37+
host_url="http://localhost:8080",
38+
class_name="elements",
39+
access_config=None,
40+
anonymous=True,
41+
),
42+
stager_config=WeaviateUploadStagerConfig(),
43+
uploader_config=WeaviateUploaderConfig(batch_size=10),
44+
).run()

0 commit comments

Comments
 (0)