diff --git a/docs/readme/indexer-skills.md b/docs/readme/indexer-skills.md index 4d53c56..ee8b2b2 100644 --- a/docs/readme/indexer-skills.md +++ b/docs/readme/indexer-skills.md @@ -265,6 +265,22 @@ Generates embeddings from text using `llama_index` library. type: embedding name: llama-fastembed ``` + +### AWS Bedrock Titan +Generates embeddings using AWS Bedrock's Titan Embed Text v2 model. AWS credentials are resolved from the standard boto3 credential chain (env vars `AWS_ACCESS_KEY_ID`/`AWS_SECRET_ACCESS_KEY`/`AWS_SESSION_TOKEN`, `AWS_PROFILE`, IAM role, `~/.aws/credentials`, etc.) — do not put them in the YAML. + +```yaml +- skill: &BedrockTitanEmbedding + type: embedding + name: bedrock-titan-embedding + params: + region: us-east-1 # Optional: falls back to AWS_REGION / default profile region + model_id: amazon.titan-embed-text-v2:0 # Optional (default) + dimensions: 1024 # Optional: 256 | 512 | 1024 (default 1024) + normalize: true # Optional (default true) + max_retries: 3 # Optional (default 3) + retry_backoff: 2 # Optional seconds, linear per attempt (default 2) +``` diff --git a/pyproject.toml b/pyproject.toml index 0ec28c8..b2f4cbc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,6 +37,7 @@ dependencies = [ "unstructured>=0.14.8", "faiss-cpu>=1.11.0", "langchain_community>=0.3.18", + "boto3>=1.34.0", ] [project.scripts] diff --git a/src/docs2vecs/subcommands/indexer/config/config_schema.yaml b/src/docs2vecs/subcommands/indexer/config/config_schema.yaml index ede449f..47bda4d 100644 --- a/src/docs2vecs/subcommands/indexer/config/config_schema.yaml +++ b/src/docs2vecs/subcommands/indexer/config/config_schema.yaml @@ -237,6 +237,28 @@ definitions: type: string deployment_name: type: string + # BedrockTitanEmbeddingSkill params + region: + type: string + required: False + model_id: + type: string + required: False + dimensions: + type: integer + required: False + allowed: [256, 512, 1024] + normalize: + type: boolean + required: False + max_retries: + 
class BedrockTitanEmbeddingSkill(IndexerSkill):
    """Indexer skill that fills ``chunk.embedding`` using an AWS Bedrock model.

    Each non-empty chunk is sent to the Bedrock runtime ``invoke_model``
    endpoint and the ``"embedding"`` field of the JSON response is stored on
    the chunk. Chunks without content receive an empty list and trigger no
    network call.

    Recognised config keys (all optional):
        region: AWS region handed to ``boto3.client``; ``None`` lets boto3
            resolve the region itself.
        model_id: Bedrock model identifier (default ``DEFAULT_MODEL_ID``).
        dimensions: value sent as ``"dimensions"`` in the request body
            (default ``DEFAULT_DIMENSIONS``).
        normalize: value sent as ``"normalize"`` in the request body
            (default ``True``).
        max_retries: total number of attempts per chunk
            (default ``DEFAULT_MAX_RETRIES``).
        retry_backoff: base wait in seconds; the wait before the next attempt
            is ``retry_backoff * attempt_number``
            (default ``DEFAULT_RETRY_BACKOFF``).
    """

    DEFAULT_MODEL_ID = "amazon.titan-embed-text-v2:0"
    DEFAULT_DIMENSIONS = 1024
    DEFAULT_MAX_RETRIES = 3
    DEFAULT_RETRY_BACKOFF = 2

    def __init__(self, config: dict, global_config: Config):
        """Read the skill parameters and create the Bedrock runtime client.

        Args:
            config: Skill configuration; forwarded to the base class.
            global_config: Indexer-wide configuration; forwarded to the base
                class.
        """
        # NOTE(review): self._config and self.logger are presumably provided
        # by IndexerSkill.__init__ - confirm against the base class.
        super().__init__(config, global_config)
        self._model_id = self._config.get("model_id", self.DEFAULT_MODEL_ID)
        self._dimensions = self._config.get("dimensions", self.DEFAULT_DIMENSIONS)
        self._normalize = self._config.get("normalize", True)
        self._max_retries = self._config.get("max_retries", self.DEFAULT_MAX_RETRIES)
        self._retry_backoff = self._config.get("retry_backoff", self.DEFAULT_RETRY_BACKOFF)
        # No credentials are passed explicitly; boto3 resolves them on its own.
        self._client = boto3.client(
            "bedrock-runtime",
            region_name=self._config.get("region"),
        )

    def _embed_text(self, content: str, chunk_id=None):
        """Request an embedding for ``content``, retrying on failure.

        Args:
            content: Text to embed.
            chunk_id: Identifier used only in log messages.

        Returns:
            The value of the ``"embedding"`` key of the model response.

        Raises:
            Exception: Whatever the final attempt raised, once all attempts
                are exhausted.
        """
        self.logger.debug(
            f"Requesting Bedrock embedding for chunk_id={chunk_id}, content_length={len(content)}"
        )
        # The payload is identical for every attempt, so serialise it once.
        body = json.dumps(
            {
                "inputText": content,
                "dimensions": self._dimensions,
                "normalize": self._normalize,
            }
        )
        # Always make at least one call: with max_retries < 1 the loop body
        # would never run and the method would silently return None.
        attempts = max(1, self._max_retries)
        for attempt in range(1, attempts + 1):
            try:
                resp = self._client.invoke_model(
                    modelId=self._model_id,
                    body=body,
                    contentType="application/json",
                    accept="application/json",
                )
                embedding = json.loads(resp["body"].read())["embedding"]
            except Exception as exc:
                if attempt == attempts:
                    raise
                # Linear backoff: the wait grows with the attempt number.
                wait = self._retry_backoff * attempt
                self.logger.warning(
                    f"Bedrock call failed (attempt {attempt}/{attempts}): {exc} - retrying in {wait}s"
                )
                time.sleep(wait)
            else:
                self.logger.debug(
                    f"Successfully received embedding for chunk_id={chunk_id}, embedding_dim={len(embedding) if embedding else 0}"
                )
                return embedding

    def run(self, input: Optional[List[Document]] = None) -> Optional[List[Document]]:
        """Embed every chunk of every document in ``input`` in place.

        Args:
            input: Documents whose chunks should be embedded, or ``None``.

        Returns:
            The same ``input`` object, with ``chunk.embedding`` set on every
            chunk; ``None`` when ``input`` is ``None``.
        """
        self.logger.info(
            f"Running Bedrock Titan Embedding Skill with model_id: {self._model_id}..."
        )

        # The signature advertises None as the default, so it must not crash.
        if input is None:
            self.logger.warning("No input documents provided. Nothing to embed.")
            return input

        docs_count = len(input)
        chunks_count = sum(len(doc.chunks) for doc in input)

        self.logger.info(
            f"Processing a total of documents: {docs_count}. Total number of chunks: {chunks_count}"
        )

        for doc in input:
            self.logger.debug(f"Processing document: {doc.filename}")
            for chunk in doc.chunks:
                self.logger.debug(f"Creating embedding for chunk: {chunk.chunk_id}")
                if chunk.content:
                    chunk.embedding = self._embed_text(
                        chunk.content, chunk_id=chunk.chunk_id
                    )
                else:
                    # Empty chunks get an empty vector without calling Bedrock.
                    chunk.embedding = []

        return input