Merge pull request AmadeusITGroup#69 from MarouaneBenabdelkader/feat/bedrock-titan-embedding

dpomian · web-flow · commit d98828677d39 · 2026-04-28T11:03:53.000+02:00
feat: add bedrock-titan-embedding skill for AWS Bedrock Titan V2
diff --git a/docs/readme/indexer-skills.md b/docs/readme/indexer-skills.md
@@ -265,6 +265,22 @@ Generates embeddings from text using `llama_index` library.
     type: embedding
     name: llama-fastembed
 ```
+
+### AWS Bedrock Titan
+Generates embeddings using AWS Bedrock's Titan Embed Text v2 model. AWS credentials are resolved from the standard boto3 credential chain (env vars `AWS_ACCESS_KEY_ID`/`AWS_SECRET_ACCESS_KEY`/`AWS_SESSION_TOKEN`, `AWS_PROFILE`, IAM role, `~/.aws/credentials`, etc.) — do not put them in the YAML.
+
+```yaml
+- skill: &BedrockTitanEmbedding
+    type: embedding
+    name: bedrock-titan-embedding
+    params:
+      region: us-east-1                         # Optional: falls back to AWS_REGION / default profile region
+      model_id: amazon.titan-embed-text-v2:0    # Optional (default)
+      dimensions: 1024                          # Optional: 256 | 512 | 1024 (default 1024)
+      normalize: true                           # Optional (default true)
+      max_retries: 3                            # Optional: total attempts per chunk, first call included (default 3)
+      retry_backoff: 2                          # Optional: base wait in seconds; after the Nth failed attempt the skill waits N x this value (default 2)
+```
 </details>
 
 
diff --git a/pyproject.toml b/pyproject.toml
@@ -37,6 +37,7 @@ dependencies = [
     "unstructured>=0.14.8",
     "faiss-cpu>=1.11.0",
     "langchain_community>=0.3.18",
+    "boto3>=1.34.0",
 ]
 
 [project.scripts]
diff --git a/src/docs2vecs/subcommands/indexer/config/config_schema.yaml b/src/docs2vecs/subcommands/indexer/config/config_schema.yaml
@@ -237,6 +237,28 @@ definitions:
                     type: string
                   deployment_name:
                     type: string
+              # BedrockTitanEmbeddingSkill params
+              region:
+                type: string
+                required: False
+              model_id:
+                type: string
+                required: False
+              dimensions:
+                type: integer
+                required: False
+                allowed: [256, 512, 1024]
+              normalize:
+                type: boolean
+                required: False
+              max_retries:
+                type: integer
+                required: False
+                min: 1
+              retry_backoff:
+                type: integer
+                required: False
+                min: 1
 
       skillset:
         type: list
diff --git a/src/docs2vecs/subcommands/indexer/skills/__init__.py b/src/docs2vecs/subcommands/indexer/skills/__init__.py
@@ -16,6 +16,7 @@
 from .teams_qna_loader_skill import TeamsQnALoaderSkill
 from .confluence_faq_splitter_skill import ConfluenceFAQSplitter
 from .json_writer_skill import JSONWriterSkill
+from .bedrock_titan_embedding_skill import BedrockTitanEmbeddingSkill
 
 
 __all__ = [
@@ -37,4 +38,5 @@
     "TeamsQnALoaderSkill",
     "ConfluenceFAQSplitter",
     "JSONWriterSkill",
+    "BedrockTitanEmbeddingSkill",
 ]
diff --git a/src/docs2vecs/subcommands/indexer/skills/bedrock_titan_embedding_skill.py b/src/docs2vecs/subcommands/indexer/skills/bedrock_titan_embedding_skill.py
@@ -0,0 +1,83 @@
+import json
+import time
+from typing import List, Optional
+
+import boto3
+
+from docs2vecs.subcommands.indexer.config.config import Config
+from docs2vecs.subcommands.indexer.document.document import Document
+from docs2vecs.subcommands.indexer.skills.skill import IndexerSkill
+
+
+class BedrockTitanEmbeddingSkill(IndexerSkill):
+    """Embedding skill that vectorizes document chunks through AWS Bedrock Titan."""
+
+    DEFAULT_MODEL_ID = "amazon.titan-embed-text-v2:0"
+    DEFAULT_DIMENSIONS = 1024
+    DEFAULT_MAX_RETRIES = 3  # total attempts per chunk (first call included)
+    DEFAULT_RETRY_BACKOFF = 2  # seconds; multiplied by the attempt number
+
+    def __init__(self, config: dict, global_config: Config):
+        """Read the optional skill params and create the bedrock-runtime client."""
+        super().__init__(config, global_config)
+        self._model_id = self._config.get("model_id", self.DEFAULT_MODEL_ID)
+        self._dimensions = self._config.get("dimensions", self.DEFAULT_DIMENSIONS)
+        self._normalize = self._config.get("normalize", True)
+        self._max_retries = self._config.get("max_retries", self.DEFAULT_MAX_RETRIES)
+        self._retry_backoff = self._config.get("retry_backoff", self.DEFAULT_RETRY_BACKOFF)
+        # With no "region" param this passes None and boto3 resolves the region itself.
+        self._client = boto3.client("bedrock-runtime", region_name=self._config.get("region"))
+
+    def _embed_text(self, content: str, chunk_id=None):
+        """Return the embedding vector Bedrock produces for ``content``.
+
+        The call is attempted ``max_retries`` times (at least once), sleeping
+        ``retry_backoff * attempt`` seconds in between; the last error is re-raised.
+        """
+        self.logger.debug(f"Requesting Bedrock embedding for chunk_id={chunk_id}, content_length={len(content)}")
+        body = json.dumps(
+            {"inputText": content, "dimensions": self._dimensions, "normalize": self._normalize}
+        )
+        # Floor at 1: with max_retries <= 0 the loop would be skipped and None returned.
+        attempts = max(1, self._max_retries)
+        for attempt in range(1, attempts + 1):
+            try:
+                resp = self._client.invoke_model(
+                    modelId=self._model_id,
+                    body=body,
+                    contentType="application/json",
+                    accept="application/json",
+                )
+                embedding = json.loads(resp["body"].read())["embedding"]
+            except Exception as exc:
+                # NOTE(review): every exception is retried, including non-transient ones.
+                if attempt == attempts:
+                    raise
+                wait = self._retry_backoff * attempt
+                self.logger.warning(
+                    f"Bedrock call failed (attempt {attempt}/{attempts}): {exc} - retrying in {wait}s"
+                )
+                time.sleep(wait)
+            else:
+                self.logger.debug(
+                    f"Successfully received embedding for chunk_id={chunk_id}, embedding_dim={len(embedding) if embedding else 0}"
+                )
+                return embedding
+
+    def run(self, input: Optional[List[Document]] = None) -> Optional[List[Document]]:
+        """Set ``chunk.embedding`` on every chunk in place and return ``input``.
+
+        Chunks with empty content get ``[]``; a ``None`` input is returned unchanged.
+        """
+        self.logger.info(f"Running Bedrock Titan Embedding Skill with model_id: {self._model_id}...")
+        documents = input or []  # tolerate a missing input instead of failing on len(None)
+        docs_count = len(documents)
+        chunks_count = sum(len(doc.chunks) for doc in documents)
+        self.logger.info(f"Processing a total of documents: {docs_count}. Total number of chunks: {chunks_count}")
+
+        for doc in documents:
+            self.logger.debug(f"Processing document: {doc.filename}")
+            for chunk in doc.chunks:
+                self.logger.debug(f"Creating embedding for chunk: {chunk.chunk_id}")
+                chunk.embedding = self._embed_text(chunk.content, chunk_id=chunk.chunk_id) if chunk.content else []
+        return input
diff --git a/src/docs2vecs/subcommands/indexer/skills/factory.py b/src/docs2vecs/subcommands/indexer/skills/factory.py
@@ -19,6 +19,7 @@
 from docs2vecs.subcommands.indexer.skills.confluence_faq_splitter_skill import ConfluenceFAQSplitter
 from docs2vecs.subcommands.indexer.skills.teams_qna_loader_skill import TeamsQnALoaderSkill
 from docs2vecs.subcommands.indexer.skills.json_writer_skill import JSONWriterSkill
+from docs2vecs.subcommands.indexer.skills.bedrock_titan_embedding_skill import BedrockTitanEmbeddingSkill
 
 
 class SkillType(StrEnum):
@@ -60,6 +61,7 @@ class AvailableSkillName(StrEnum):
     # embeddings
     AZ_ADA002_EMBEDDING = "azure-ada002-embedding"
     LLAMA_FASTEMBED = "llama-fastembed"
+    BEDROCK_TITAN_EMBEDDING = "bedrock-titan-embedding"
 
     # web loaders
     JIRA_LOADER = "jira-loader"
@@ -81,6 +83,7 @@ class AvailableSkillName(StrEnum):
     SkillType.EMBEDDING: {
         AvailableSkillName.AZ_ADA002_EMBEDDING: AzureAda002EmbeddingSkill,
         AvailableSkillName.LLAMA_FASTEMBED: LlamaFastembedEmbeddingSkill,
+        AvailableSkillName.BEDROCK_TITAN_EMBEDDING: BedrockTitanEmbeddingSkill,
     },
     SkillType.VECTOR_STORE: {
         AvailableSkillName.AZ_AISearch: AzureVectorStoreSkill,

Original file line number	Diff line number	Diff line change
`@@ -37,6 +37,7 @@ dependencies = [`
`37`	`37`	`"unstructured>=0.14.8",`
`38`	`38`	`"faiss-cpu>=1.11.0",`
`39`	`39`	`"langchain_community>=0.3.18",`
	`40`	`+ "boto3>=1.34.0",`
`40`	`41`	`]`
`41`	`42`
`42`	`43`	`[project.scripts]`