|
| 1 | +import json |
| 2 | +from io import BytesIO |
| 3 | +from typing import Any, Dict, List |
| 4 | + |
| 5 | +import httpx |
| 6 | + |
| 7 | +from srdt_analysis.albert import AlbertBase |
| 8 | +from srdt_analysis.constants import ALBERT_ENDPOINT |
| 9 | +from srdt_analysis.models import ChunkDataList, DocumentData |
| 10 | + |
| 11 | + |
class Collections(AlbertBase):
    """Client for the collection, search and file endpoints under ALBERT_ENDPOINT.

    Every request is sent with ``self.headers``. That attribute is not set in
    this class; presumably ``AlbertBase`` provides it (confirm there).

    All methods raise ``httpx.HTTPStatusError`` when the server answers with a
    4xx/5xx status, so a failed call never reaches the JSON parsing step.
    """

    def _create(self, collection_name: str, model: str) -> str:
        """Create a collection unconditionally and return its id.

        Args:
            collection_name: Value sent as the ``name`` field.
            model: Value sent as the ``model`` field.

        Returns:
            The ``id`` field of the JSON response.

        Raises:
            httpx.HTTPStatusError: If the server returns an error status.
        """
        payload = {"name": collection_name, "model": model}
        response = httpx.post(
            f"{ALBERT_ENDPOINT}/v1/collections", headers=self.headers, json=payload
        )
        # Fail with the HTTP error instead of a KeyError on an error payload.
        response.raise_for_status()
        return response.json()["id"]

    def create(self, collection_name: str, model: str) -> str:
        """Create a fresh collection, replacing any that share its name.

        Every existing collection named ``collection_name`` is deleted first,
        then a new one is created.

        Args:
            collection_name: Name of the collection to (re)create.
            model: Value sent as the ``model`` field of the new collection.

        Returns:
            The id of the newly created collection.

        Raises:
            httpx.HTTPStatusError: If any underlying request fails.
        """
        self.delete_all(collection_name)
        return self._create(collection_name, model)

    def list(self) -> List[Dict[str, Any]]:
        """Return the ``data`` field of the collections listing.

        Raises:
            httpx.HTTPStatusError: If the server returns an error status.
        """
        response = httpx.get(f"{ALBERT_ENDPOINT}/v1/collections", headers=self.headers)
        response.raise_for_status()
        return response.json()["data"]

    def delete(self, id_collection: str) -> None:
        """Delete the collection identified by ``id_collection``.

        Raises:
            httpx.HTTPStatusError: If the server returns an error status.
        """
        response = httpx.delete(
            f"{ALBERT_ENDPOINT}/v1/collections/{id_collection}", headers=self.headers
        )
        response.raise_for_status()

    def delete_all(self, collection_name: str) -> None:
        """Delete every listed collection whose name equals ``collection_name``.

        Does nothing when no collection matches.

        Raises:
            httpx.HTTPStatusError: If listing or any deletion fails.
        """
        for collection in self.list():
            if collection["name"] == collection_name:
                self.delete(collection["id"])

    def search(
        self,
        prompt: str,
        id_collections: List[str],
        k: int = 5,
        score_threshold: float = 0,
    ) -> ChunkDataList:
        """Run a search request and return the decoded JSON response.

        Args:
            prompt: Value sent as the ``prompt`` field.
            id_collections: Collection ids sent as the ``collections`` field.
            k: Value sent as the ``k`` field.
            score_threshold: Value sent as the ``score_threshold`` field.

        Returns:
            The JSON body of the response, unmodified.

        Raises:
            httpx.HTTPStatusError: If the server returns an error status.
        """
        response = httpx.post(
            f"{ALBERT_ENDPOINT}/v1/search",
            headers=self.headers,
            json={
                "prompt": prompt,
                "collections": id_collections,
                "k": k,
                "score_threshold": score_threshold,
            },
        )
        # Without this an error body would be returned as if it were results.
        response.raise_for_status()
        return response.json()

    def upload(
        self,
        data: List[DocumentData],
        id_collection: str,
    ) -> None:
        """Upload the chunks of every document as one JSON file.

        Each chunk in a document's ``content_chunked`` becomes one entry with
        the chunk text, the document title, and metadata (``cdtn_id``, the
        chunk's own metadata under ``structure_du_chunk``, and ``url``). The
        entries are serialized to a single ``content.json`` file and posted to
        the files endpoint together with the target collection id.

        Args:
            data: Documents to upload.
            id_collection: Id of the collection that receives the file.

        Raises:
            httpx.HTTPStatusError: If the server returns an error status.
        """
        entries = [
            {
                "text": chunk.page_content,
                "title": document["title"],
                "metadata": {
                    "cdtn_id": document["cdtn_id"],
                    "structure_du_chunk": chunk.metadata,
                    "url": document["url"],
                },
            }
            for document in data
            for chunk in document["content_chunked"]
        ]

        file_content = json.dumps(entries).encode("utf-8")

        files = {
            "file": (
                "content.json",
                BytesIO(file_content),
                # NOTE(review): this is the content type of the file part, yet
                # the payload is JSON; kept as-is because the server's
                # expectation is not visible here — confirm before changing.
                "multipart/form-data",
            )
        }

        # json.dumps escapes quotes/backslashes in the id; plain string
        # interpolation would produce invalid JSON for such values.
        request_data = {"request": json.dumps({"collection": id_collection})}
        response = httpx.post(
            f"{ALBERT_ENDPOINT}/v1/files",
            headers=self.headers,
            files=files,
            data=request_data,
        )

        response.raise_for_status()
0 commit comments