etalab-ia
diff --git a/‎api/clients/vector_store/_elasticsearchvectorstoreclient.py‎
Lines changed: 299 additions & 0 deletions b/‎api/clients/vector_store/_elasticsearchvectorstoreclient.py‎
Lines changed: 299 additions & 0 deletions
diff --git a/‎api/endpoints/chunks.py‎
Lines changed: 38 additions & 3 deletions b/‎api/endpoints/chunks.py‎
Lines changed: 38 additions & 3 deletions
@@ -0,0 +1,299 @@
+import hashlib
+
+from elasticsearch import AsyncElasticsearch, helpers
+
+from api.clients.vector_store._basevectorstoreclient import BaseVectorStoreClient
+from api.schemas.chunks import Chunk
+from api.schemas.search import Search, SearchMethod
+
+
+class ElasticsearchVectorStoreClient(BaseVectorStoreClient, AsyncElasticsearch):
+    default_method = SearchMethod.HYBRID
+
+    def __init__(self, *args, **kwargs):
+        kwargs.pop("type", None)  # remove type from kwargs to avoid passing it to the super class
+        self.number_of_shards = kwargs.pop("number_of_shards", 1)  # remove number_of_shards from kwargs to avoid passing it to the super class
+        self.number_of_replicas = kwargs.pop("number_of_replicas", 1)  # remove number_of_replicas from kwargs to avoid passing it to the super class
+        AsyncElasticsearch.__init__(self, *args, **kwargs)
+
+    async def check(self) -> bool:
+        try:
+            await self.ping()
+            return True
+        except Exception:
+            return False
+
+    async def close(self) -> None:
+        await super(AsyncElasticsearch, self).transport.close()
+
+    async def create_collection(self, collection_id: int, vector_size: int) -> None:
+        if await self.indices.exists(index=str(collection_id)):
+            return
+
+        settings = {
+            "number_of_shards": self.number_of_shards,
+            "number_of_replicas": self.number_of_replicas,
+            "similarity": {"default": {"type": "BM25"}},
+            "analysis": {
+                "filter": {
+                    "french_stop": {"type": "stop", "stopwords": "_french_"},
+                    "french_stemmer": {"type": "stemmer", "language": "light_french"},
+                },
+                "analyzer": {
+                    "french_analyzer": {
+                        "tokenizer": "standard",
+                        "filter": ["lowercase", "french_stop", "french_stemmer"],
+                    }
+                },
+            },
+        }
+
+        mappings = {
+            "dynamic_templates": [
+                {"metadata_objects_disabled": {"path_match": "metadata.*", "match_mapping_type": "object", "mapping": {"enabled": False}}},
+                {
+                    "metadata_dates_by_name": {
+                        "path_match": "metadata.*",
+                        "match_pattern": "regex",
+                        "match": "(?i).*(_at|_date|date)$",
+                        "mapping": {
+                            "type": "date",
+                            "ignore_malformed": True,
+                            "format": "strict_date_optional_time||strict_date_time||yyyy-MM-dd'T'HH:mm:ssZ||epoch_millis",
+                        },
+                    }
+                },
+                {"metadata_bools": {"path_match": "metadata.*", "match_mapping_type": "boolean", "mapping": {"type": "boolean"}}},
+                {
+                    "metadata_numbers_long": {
+                        "path_match": "metadata.*",
+                        "match_mapping_type": "long",
+                        "mapping": {"type": "long", "ignore_malformed": True, "coerce": True},
+                    }
+                },
+                {
+                    "metadata_numbers_double": {
+                        "path_match": "metadata.*",
+                        "match_mapping_type": "double",
+                        "mapping": {"type": "double", "ignore_malformed": True, "coerce": True},
+                    }
+                },
+                {
+                    "metadata_strings": {
+                        "path_match": "metadata.*",
+                        "match_mapping_type": "string",
+                        "mapping": {"type": "keyword", "ignore_above": 1024},
+                    }
+                },
+            ],
+            "date_detection": False,
+            "numeric_detection": False,
+            "properties": {
+                "id": {"type": "integer"},
+                "embedding": {"type": "dense_vector", "dims": vector_size},
+                "content": {"type": "text", "analyzer": "french_analyzer"},
+                "metadata": {"type": "object", "dynamic": True},
+            },
+        }
+
+        await self.indices.create(index=str(collection_id), mappings=mappings, settings=settings)
+
+    async def delete_collection(self, collection_id: int) -> None:
+        if not await self.indices.exists(index=str(collection_id)):
+            return
+
+        await self.indices.delete(index=str(collection_id))
+
+    async def get_collections(self) -> list[int]:
+        collections = await self.indices.get_alias()
+        return [int(collection) for collection in collections]
+
+    async def get_chunk_count(self, collection_id: int, document_id: int) -> int | None:
+        try:
+            body = {"query": {"match": {"metadata.document_id": document_id}}}
+            result = await AsyncElasticsearch.count(self, index=str(collection_id), body=body)
+            return result["count"]
+        except Exception:
+            return None
+
+    async def delete_document(self, collection_id: int, document_id: int) -> None:
+        body = {"query": {"match": {"metadata.document_id": document_id}}}
+        await AsyncElasticsearch.delete_by_query(self, index=str(collection_id), body=body)
+        await self.indices.refresh(index=str(collection_id))
+
+    async def get_chunks(self, collection_id: int, document_id: int, offset: int = 0, limit: int = 10, chunk_id: int | None = None) -> list[Chunk]:
+        body = {"query": {"bool": {"must": [{"match": {"metadata.document_id": document_id}}]}}, "_source": ["id", "content", "metadata"]}
+        if chunk_id is not None:
+            body["query"]["bool"]["must"].append({"term": {"id": chunk_id}})
+
+        results = await AsyncElasticsearch.search(self, index=str(collection_id), body=body, from_=offset, size=limit)
+        chunks = []
+        for hit in results["hits"]["hits"]:
+            chunks.append(
+                Chunk(
+                    id=hit["_source"]["id"],
+                    document=document_id,
+                    collection=collection_id,
+                    content=hit["_source"]["content"],
+                    metadata=hit["_source"]["metadata"],
+                )
+            )
+        return chunks
+
+    async def upsert(self, collection_id: int, chunks: list[Chunk], embeddings: list[list[float]]) -> None:
+        actions = [
+            {
+                "_index": str(collection_id),
+                "_id": hashlib.sha256(f"{chunk.collection}|{chunk.document}|{chunk.id}".encode()).hexdigest(),
+                "_source": {
+                    "id": chunk.id,
+                    "content": chunk.content,
+                    "embedding": embedding,
+                    "metadata": chunk.metadata,
+                },
+            }
+            for chunk, embedding in zip(chunks, embeddings)
+        ]
+
+        await helpers.async_bulk(client=self, actions=actions, index=collection_id)
+        await self.indices.refresh(index=str(collection_id))
+
+    async def search(
+        self,
+        method: SearchMethod,
+        collection_ids: list[int],
+        query_prompt: str,
+        query_vector: list[float],
+        limit: int,
+        offset: int,
+        rff_k: int | None = 20,
+        score_threshold: float = 0.0,
+    ) -> list[Search]:
+        if method == SearchMethod.SEMANTIC:
+            searches = await self._semantic_search(
+                query_vector=query_vector, collection_ids=collection_ids, limit=limit, offset=offset, score_threshold=score_threshold
+            )
+
+        elif method == SearchMethod.LEXICAL:
+            searches = await self._lexical_search(
+                query_prompt=query_prompt, collection_ids=collection_ids, limit=limit, offset=offset, score_threshold=score_threshold
+            )
+
+        else:  # method == SearchMethod.HYBRID
+            searches = await self._hybrid_search(
+                query_prompt=query_prompt, query_vector=query_vector, collection_ids=collection_ids, limit=limit, offset=offset, rff_k=rff_k
+            )
+
+        return searches
+
+    async def _lexical_search(
+        self, query_prompt: str, collection_ids: list[int], limit: int, offset: int, score_threshold: float = 0.0
+    ) -> list[Search]:
+        collection_ids = [str(x) for x in collection_ids]
+        fuzziness = {"fuzziness": "AUTO"} if len(query_prompt.split()) < 25 else {}
+        body = {
+            "query": {"multi_match": {"query": query_prompt, **fuzziness}},
+            "size": limit,
+            "from": offset,
+            "_source": {"excludes": ["embedding"]},
+        }
+        results = await AsyncElasticsearch.search(self, index=collection_ids, body=body)
+        hits = [hit for hit in results["hits"]["hits"] if hit]
+        searches = [
+            Search(
+                method=SearchMethod.LEXICAL.value,
+                score=hit["_score"],
+                chunk=Chunk(id=hit["_source"]["id"], content=hit["_source"]["content"], metadata=hit["_source"]["metadata"]),
+            )
+            for hit in hits
+        ]
+
+        searches = [search for search in searches if search.score >= score_threshold]
+        searches = sorted(searches, key=lambda x: x.score, reverse=True)[:limit]
+
+        return searches
+
+    async def _semantic_search(
+        self, query_vector: list[float], collection_ids: list[int], limit: int, offset: int, score_threshold: float = 0.0
+    ) -> list[Search]:
+        collection_ids = [str(x) for x in collection_ids]
+        body = {
+            "knn": {"field": "embedding", "query_vector": query_vector, "k": limit, "num_candidates": max(limit * 10, 100)},
+            "size": limit,
+            "from": offset,
+            "_source": {"excludes": ["embedding"]},
+        }
+        results = await AsyncElasticsearch.search(self, index=collection_ids, body=body)
+        hits = [hit for hit in results["hits"]["hits"] if hit]
+        searches = [
+            Search(
+                method=SearchMethod.SEMANTIC.value,
+                score=hit["_score"],
+                chunk=Chunk(id=hit["_source"]["id"], content=hit["_source"]["content"], metadata=hit["_source"]["metadata"]),
+            )
+            for hit in hits
+        ]
+
+        searches = [search for search in searches if search.score >= score_threshold]
+        searches = sorted(searches, key=lambda x: x.score, reverse=True)[:limit]
+
+        return searches
+
+    async def _hybrid_search(
+        self, query_prompt: str, query_vector: list[float], collection_ids: list[int], limit: int, offset: int, rff_k: int, expansion_factor: int = 2
+    ) -> list[Search]:
+        """
+        Hybrid search combines lexical and semantic search results using Reciprocal Rank Fusion (RRF).
+
+        Args:
+            query_prompt (str): The search prompt
+            query_vector (list[float]): The query vector
+            collection_ids (List[int]): The collection ids
+            k (int): The number of results to return
+            rff_k (int): The constant k in the RRF formula
+            expansion_factor (int): The factor that increases the number of results to search in each method before reranking
+
+        Returns:
+            A combined list of searches with updated scores
+        """
+        lexical_searches = await self._lexical_search(
+            query_prompt=query_prompt, collection_ids=collection_ids, limit=int(limit * expansion_factor), offset=offset
+        )
+        semantic_searches = await self._semantic_search(
+            query_vector=query_vector, collection_ids=collection_ids, limit=int(limit * expansion_factor), offset=offset
+        )
+
+        combined_scores = {}
+        search_map = {}
+        for searches in [lexical_searches, semantic_searches]:
+            for rank, search in enumerate(searches):
+                chunk_id = search.chunk.metadata.get("document_id") + search.chunk.id
+                if chunk_id not in combined_scores:
+                    combined_scores[chunk_id] = 0
+                    search_map[chunk_id] = search
+                    search_map[chunk_id].method = SearchMethod.HYBRID
+                combined_scores[chunk_id] += 1 / (rff_k + rank + 1)
+
+        ranked_scores = sorted(combined_scores.items(), key=lambda item: item[1], reverse=True)
+        reranked_searches = []
+        for chunk_id, rrf_score in ranked_scores:
+            search = search_map[chunk_id]
+            search.score = rrf_score
+            reranked_searches.append(search)
+
+        searches = reranked_searches[:limit]
+
+        return searches
+
+    async def last_chunk_id(self, collection_id: int, document_id: int) -> int | None:
+        result = await AsyncElasticsearch.search(
+            self,
+            index=str(collection_id),
+            size=0,
+            query={"match": {"metadata.document_id": document_id}},
+            aggs={"id_max": {"max": {"field": "id"}}},
+        )
+        value = result["aggregations"]["id_max"]["value"]
+        value = int(value) if value is not None else None
+
+        return value
@@ -2,19 +2,54 @@
 
 from elasticsearch import AsyncElasticsearch
 from fastapi import APIRouter, Depends, Path, Query, Request, Security
+from redis.asyncio import Redis as AsyncRedis
 from sqlalchemy.ext.asyncio import AsyncSession
 
 from api.helpers._accesscontroller import AccessController
 from api.helpers._elasticsearchvectorstore import ElasticsearchVectorStore
-from api.schemas.chunks import Chunk, Chunks
+from api.helpers.models import ModelRegistry
+from api.schemas.chunks import Chunk, Chunks, ChunksResponse, CreateChunks
 from api.utils.context import global_context, request_context
-from api.utils.dependencies import get_elasticsearch_client, get_elasticsearch_vector_store, get_postgres_session
-from api.utils.exceptions import ChunkNotFoundException
+from api.utils.dependencies import (
+    get_elasticsearch_client,
+    get_elasticsearch_vector_store,
+    get_model_registry,
+    get_postgres_session,
+    get_redis_client,
+)
+from api.utils.exceptions import ChunkNotFoundException, CollectionNotFoundException
 from api.utils.variables import ENDPOINT__CHUNKS, ROUTER__CHUNKS
 
 router = APIRouter(prefix="/v1", tags=[ROUTER__CHUNKS.title()])
 
 
+@router.post(path=ENDPOINT__CHUNKS, dependencies=[Security(dependency=AccessController())], status_code=201)
+async def create_chunks(
+    request: Request,
+    body: CreateChunks,
+    postgres_session: AsyncSession = Depends(get_postgres_session),
+    redis_client: AsyncRedis = Depends(get_redis_client),
+    model_registry: ModelRegistry = Depends(get_model_registry),
+) -> ChunksResponse:
+    """
+    Fill document with chunks.
+    """
+    if not global_context.document_manager:  # no vector store available
+        raise CollectionNotFoundException()
+
+    chunk_ids = await global_context.document_manager.create_chunks(
+        postgres_session=postgres_session,
+        document_id=body.document,
+        chunks=body.chunks,
+        user_id=request_context.get().user_info.id,
+        redis_client=redis_client,
+        model_registry=model_registry,
+        request_context=request_context,
+    )
+
+    return ChunksResponse(ids=chunk_ids, status_code=201)
+
+
 @router.get(path=ENDPOINT__CHUNKS + "/{document:path}/{chunk:path}", dependencies=[Security(dependency=AccessController())], status_code=200)
 async def get_chunk(
     request: Request,