
Commit 18057f8

clean, create two distinct pipeline folders, remove configs in kotaemon code, fix one kotaemon bug for retrieval
1 parent 0476ca3 commit 18057f8

File tree

71 files changed (+91314, −279 lines)


rag_system/Dockerfile

Lines changed: 4 additions & 3 deletions

```diff
@@ -30,9 +30,10 @@ RUN bash scripts/download_pdfjs.sh $PDFJS_PREBUILT_DIR
 # Copy project files
 COPY kotaemon/libs /app/libs
 COPY kotaemon/launch.sh /app/launch.sh
-COPY kotaemon/.env.example /app/.env
-COPY flowsettings.py /app/flowsettings.py
-COPY pipeline_scripts /app/pipeline_scripts
+COPY kotaemon/app.py /app/app.py
+COPY kotaemon_pipeline_scripts/.env /app/.env
+COPY kotaemon_pipeline_scripts/flowsettings.py /app/flowsettings.py
+COPY kotaemon_pipeline_scripts /app/pipeline_scripts
 COPY taxonomy /app/taxonomy
 
```

rag_system/README.md

Lines changed: 131 additions & 5 deletions
```diff
@@ -13,36 +13,162 @@ rag_system
 ├── flowsettings.py
 ├── kotaemon
 ├── kotaemon_install_guide
-├── pipeline_scripts
+├── kotaemon_pipeline_scripts
+├── new_pipeline_scripts
 ├── README.md
 └── taxonomy
 ```
 
-## Pipeline Scripts Instructions
+There are two pipeline ingestion projects here that share the same taxonomy.
+
+The first one (with Kotaemon) uses these folders:
+
+```bash
+├── docker-compose.yml
+├── Dockerfile
+├── kotaemon
+├── kotaemon_install_guide
+├── kotaemon_pipeline_scripts
+└── taxonomy
+```
+
+The second one (currently without Kotaemon?) uses these folders:
+
+```bash
+├── new_pipeline_scripts
+├── README.md
+└── taxonomy
+```
+
+## NEW Pipeline Scripts Instructions
 
 The pipeline scripts folder contains scripts for the extraction and analysis of documents.
 
 To set up the pipeline scripts, run the following commands:
 
 ```bash
-cd rag_system/pipeline_scripts
+cd rag_system/new_pipeline_scripts
 uv sync
 ```
 
 You can find a detailed guide here: [📄](../rag_system/pipeline_scripts/agentic_data_policies_extraction/policies_transformation_to_matrices/README.md)
 
-## Running the RAG System
+### Running the RAG System
 
 We recommend running as a Python module, or using the Docker Compose file:
 
 ```bash
-cd rag_system/pipeline_scripts
+cd rag_system/new_pipeline_scripts
 uv run python -m agentic_data_policies_extraction.main
 ```
 
+## KOTAEMON Pipeline Scripts Instructions
+
+This framework is built around Kotaemon to allow a new custom-built 'fast' ingestion script (multi-threaded ingestion of hundreds of documents at the same time), side by side with the standard drag-and-drop Kotaemon ingestion from the UI.
+
+Shell scripts call ...
```
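For illustration, here is a minimal sketch of the multi-threaded fan-out idea behind a 'fast' ingestion script. It is generic `concurrent.futures` code: `ingest_one` and the `docs/` glob are placeholders, not functions or paths from this repo.

```python
# Generic sketch of multi-threaded ingestion: fan many documents out to a
# worker pool. `ingest_one` and the docs/ path are placeholders.
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path


def ingest_one(pdf: Path) -> str:
    # Placeholder: parse, chunk, embed, and index a single document.
    return f"ingested {pdf.name}"


pdfs = sorted(Path("docs").glob("*.pdf"))
with ThreadPoolExecutor(max_workers=8) as pool:
    futures = {pool.submit(ingest_one, pdf): pdf for pdf in pdfs}
    for future in as_completed(futures):
        print(future.result())
```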
```diff
+### DEV setup deployment
+
+You have two config files to check:
+
+#### 1. The official Kotaemon file 'flowsettings.py'
+
+This file is at the root of 'rag_system'. (It will overwrite the official 'flowsettings.py' during the Docker build.)
+
+It declares, among other things, the main components:
+
+- ```KH_OLLAMA_URL```: the URI used to connect to the Ollama inference service (the LLM model inference service)
+- ```KH_APP_DATA_DIR```: the root directory where Kotaemon stores all its internal data
+- ```KH_DOCSTORE```: the Kotaemon docstore to use and its path. Local LanceDB by default, but you could point to a remote LanceDB here.
+- ```KH_VECTORSTORE```: the Kotaemon vector store to use and its URL. Qdrant by default for the dev team.
+- ...
+
+You should not touch these settings for now (for a dev setup).
```
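As a reference point, here is a minimal sketch of how such settings are typically declared in a Kotaemon `flowsettings.py` (illustrative values and type strings; the actual `rag_system/flowsettings.py` in this commit is the source of truth):

```python
# Illustrative sketch in the style of a Kotaemon flowsettings.py; the real
# file in this repo is the source of truth for values and component types.
from pathlib import Path

from decouple import config  # kotaemon reads its settings via python-decouple

KH_OLLAMA_URL = config("KH_OLLAMA_URL", default="http://ollama:11434/v1/")
KH_APP_DATA_DIR = Path(config("KH_APP_DATA_DIR", default="./ktem_app_data"))

# Docstore: local LanceDB by default (a remote LanceDB could be set instead).
KH_DOCSTORE = {
    "__type__": "kotaemon.storages.LanceDBDocumentStore",
    "path": str(KH_APP_DATA_DIR / "user_data" / "docstore"),
}

# Vector store: Qdrant for the dev team (the env var names are placeholders).
KH_VECTORSTORE = {
    "__type__": "kotaemon.storages.QdrantVectorStore",
    "url": config("QDRANT_URL", default="http://qdrant:6333"),
    "api_key": config("QDRANT_API_KEY", default=""),
}
```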
```diff
+#### 2. An additional .env to set inside the 'kotaemon_pipeline_scripts' folder
+
+This file lives inside 'kotaemon_pipeline_scripts'.
+
+You have to generate your own .env from the .env.example template.
+
+All these config parameters are needed for the automatic fast ingestion pipeline:
+
+- ```PG_DATABASE_URL``` = the URL of the Data4Good database that maintains the OpenAlex article metadata
+- ```LLM_INFERENCE_URL``` = the URL of the LLM inference stack (Ollama for local dev)
+- ```LLM_INFERENCE_MODEL``` = the model used for chunk inference on metadata
+- ```LLM_INFERENCE_API_KEY``` = the API key for the LLM inference stack
+- ```EMBEDDING_MODEL_URL``` = the URL of the LLM embedding model stack (Ollama for local dev)
+- ```EMBEDDING_MODEL``` = the model used for embeddings
+- ```EMBEDDING_MODEL_API_KEY``` = the API key for the LLM embedding model
+- ```COLLECTION_ID``` = the ID of the collection within the Kotaemon app (BE CAREFUL TO CHOOSE THE RIGHT ID)
+- ```USER_ID``` = the user ID taken from the Kotaemon app (BE CAREFUL TO CHOOSE THE RIGHT ID)
+
+For now, do not set 'USER_ID' until you have launched the Kotaemon app for the first time (see below).
```
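For reference, a hedged sketch of how a pipeline script might load these parameters (assuming `python-dotenv`; the variable names are exactly the ones listed above):

```python
# Sketch: load the kotaemon_pipeline_scripts/.env parameters.
# Assumes python-dotenv; adapt to however the scripts actually load config.
import os

from dotenv import load_dotenv

load_dotenv()  # reads .env from the current working directory

PG_DATABASE_URL = os.environ["PG_DATABASE_URL"]  # OpenAlex metadata DB
LLM_INFERENCE_URL = os.environ["LLM_INFERENCE_URL"]
LLM_INFERENCE_MODEL = os.environ["LLM_INFERENCE_MODEL"]
LLM_INFERENCE_API_KEY = os.getenv("LLM_INFERENCE_API_KEY", "")
EMBEDDING_MODEL_URL = os.environ["EMBEDDING_MODEL_URL"]
EMBEDDING_MODEL = os.environ["EMBEDDING_MODEL"]
EMBEDDING_MODEL_API_KEY = os.getenv("EMBEDDING_MODEL_API_KEY", "")
COLLECTION_ID = os.environ["COLLECTION_ID"]  # pick the right collection ID
USER_ID = os.environ["USER_ID"]              # from the Kotaemon app logs
```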
```diff
+### Running the RAG System
+
+1) The 'dev' deployment is used to launch, develop, and debug the Python packages in editable mode.
+Moreover, the whole 'kotaemon_pipeline_scripts' folder is mapped (as a volume) inside the container, so you can work on it during this dev stage.
+
+First, launch the different services with the Docker Compose file provided in this folder.
+
+Nothing else to do: the Docker Compose file already sets everything up for you.
+
+You only need to check the volume mappings, if necessary.
+
+If you don't have any GPU on your local machine and haven't set up CUDA with Docker, remove these lines from the Ollama service:
+
+```yaml
+deploy:
+  resources:
+    reservations:
+      devices:
+        - driver: nvidia
+          count: all
+          capabilities: [gpu]
+```
+
+```bash
+docker compose up
+```
+
+Additionally, the command that normally launches the Kotaemon app (./launch.sh) has been deliberately disabled so you can develop on the app (coding the different libraries: kotaemon, ktem, and our custom ones) without having to stop and restart the Kotaemon container.
+
+To run the Kotaemon app for testing, you need to enter the container.
+
+From the rag_system folder where the Docker Compose file is located:
+
+```bash
+docker compose exec -it kotaemon bash
+```
+
+IMPORTANT: after launching the Kotaemon app, open any page, then check the logs to retrieve the USER ID!
+Shut down the Kotaemon app from inside the container (or shut down all the containers if you want).
+Set the correct USER ID in your .env.
+Relaunch the Kotaemon app. Your 'fast' ingestion pipeline scripts should now be consistent.
+
+2) You also need to pull the different models with the Ollama service.
+Read and follow point 2 of the README inside 'kotaemon_install_guide' (FR).
+
+3) Now, for your first steps in the Kotaemon app, read and follow point 3 of the README inside 'kotaemon_install_guide' (FR).
+
+### Running the 'Fast' ingestion pipeline scripts
+
 ## Kotaemon Subtree Setup
 
 The Kotaemon folder is a shared Data4Good subtree, synchronized with the common project:
```

rag_system/docker-compose.yml

Lines changed: 6 additions & 5 deletions

```diff
@@ -1,10 +1,10 @@
 services:
   kotaemon:
     build:
-      context: ./kotaemon
-      target: full
+      context: .
+      target: dev-runtime
     pull_policy: if_not_present
-    entrypoint: ["/bin/sh", "-c", "pip install -e /app/taxonomy && tail -f /dev/null"]
+    entrypoint: ["/bin/sh", "-c", "tail -f /dev/null"]
    environment:
      GRADIO_SERVER_NAME: 0.0.0.0
      GRADIO_SERVER_PORT: 7860
@@ -14,10 +14,11 @@ services:
     ports:
       - '7860:7860'
     volumes:
-      - './kotaemon/flowsettings.py:/app/flowsettings.py'
+      - './kotaemon_pipeline_scripts/flowsettings.py:/app/flowsettings.py'
       - './kotaemon/libs:/app/libs'
+      - './kotaemon_pipeline_scripts/.env:/app/.env'
       - './kotaemon/ktem_app_data:/app/ktem_app_data'
-      - './pipeline_scripts/:/app/pipeline_scripts'
+      - './kotaemon_pipeline_scripts/:/app/pipeline_scripts'
       - './taxonomy/:/app/taxonomy'
     depends_on:
       - ollama
```

rag_system/kotaemon/flowsettings.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -82,7 +82,7 @@
     config("KH_FEATURE_USER_MANAGEMENT_PASSWORD", default="admin")
 )
 KH_ENABLE_ALEMBIC = False
-KH_DATABASE = os.getenv("POSTGRESQL_ADDON_URI", None)  # f"sqlite:///{KH_USER_DATA_DIR / 'sql.db'}"
+KH_DATABASE = f"sqlite:///{KH_USER_DATA_DIR / 'sql.db'}"
 # KH_DATABASE = "postgresql://postgres:my_pass@postgres-db:5432/my_db"
 KH_FILESTORAGE_PATH = str(KH_USER_DATA_DIR / "files")
```

rag_system/kotaemon/libs/kotaemon/kotaemon/storages/docstores/lancedb.py

Lines changed: 23 additions & 14 deletions

```diff
@@ -5,10 +5,24 @@
 from kotaemon.base import Document
 
 from .base import BaseDocumentStore
+
+""" # Data4Good config - removed for dev setup
 CELLAR_ADDON_KEY_ID = os.getenv("CELLAR_ADDON_KEY_ID", "")
 CELLAR_ADDON_KEY_SECRET = os.getenv("CELLAR_ADDON_KEY_SECRET", "")
 CELLAR_ADDON_HOST = os.getenv("CELLAR_ADDON_HOST", "cellar-c2.services.clever-cloud.com")
 
+# And add this in the __init__ method:
+self.db_connection = lancedb.connect(
+    "s3://wsl-docstore-prod",
+    storage_options={
+        "region": "us-east-1",
+        "aws_access_key_id": CELLAR_ADDON_KEY_ID,
+        "aws_secret_access_key": CELLAR_ADDON_KEY_SECRET,
+        "endpoint": f"http://{CELLAR_ADDON_HOST}",
+        "allow_http": "true"
+    }
+"""
+
 MAX_DOCS_TO_GET = 10**4
 
 
@@ -25,16 +39,7 @@ def __init__(self, path: str = "lancedb", collection_name: str = "docstore"):
 
         self.db_uri = path
         self.collection_name = collection_name
-        self.db_connection = lancedb.connect(
-            "s3://wsl-docstore-prod",
-            storage_options={
-                "region": "us-east-1",
-                "aws_access_key_id": CELLAR_ADDON_KEY_ID,
-                "aws_secret_access_key": CELLAR_ADDON_KEY_SECRET,
-                "endpoint": f"http://{CELLAR_ADDON_HOST}",
-                "allow_http": "true"
-            }
-        )
+        self.db_connection = lancedb.connect(self.db_uri)  # type: ignore
 
     def add(
         self,
@@ -51,7 +56,7 @@ def add(
                 "text": doc.text,
                 "attributes": json.dumps(doc.metadata),
             }
-            for doc_id, doc in zip(doc_ids, docs, strict=False)
+            for doc_id, doc in zip(doc_ids, docs)
         ]
 
         if self.collection_name not in self.db_connection.table_names():
@@ -126,14 +131,18 @@ def get(self, ids: Union[List[str], str]) -> List[Document]:
             )
         except (ValueError, FileNotFoundError):
             docs = []
-        return [
-            Document(
+
+        # return the documents using the order of original
+        # ids (which were ordered by score)
+        doc_dict = {
+            doc["id"]: Document(
                 id_=doc["id"],
                 text=doc["text"] if doc["text"] else "<empty>",
                 metadata=json.loads(doc["attributes"]),
            )
            for doc in docs
-        ]
+        }
+        return [doc_dict[_id] for _id in ids if _id in doc_dict]
 
     def delete(self, ids: Union[List[str], str], refresh_indices: bool = True):
         """Delete document by id"""
```

rag_system/kotaemon/libs/kotaemon/kotaemon/storages/vectorstores/qdrant.py

Lines changed: 13 additions & 6 deletions

```diff
@@ -1,11 +1,19 @@
-import os
 from typing import Any, List, Optional, cast
 
 from .base import LlamaIndexVectorStore
 
+""" Data4Good config - removed for dev setup - please, try to add this on your settings and not here...
 VECTORSTORE_URL = os.getenv("VECTOSTORE_URL", "")
 default_api_key = os.getenv("API_KEY", "")
 
+# And add this in the __init__ method: (but please, try to add this on your settings and not here...)
+
+self._url = VECTORSTORE_URL
+self._api_key = default_api_key
+
+"""
+
 class QdrantVectorStore(LlamaIndexVectorStore):
     _li_class = None
 
@@ -31,16 +39,15 @@ def __init__(
         **kwargs: Any,
     ):
         self._collection_name = collection_name
-        self._url = VECTORSTORE_URL
-        self._api_key = default_api_key
+        self._url = url
+        self._api_key = api_key
         self._client_kwargs = client_kwargs
         self._kwargs = kwargs
-        print(f"url: {self._url}")
 
         super().__init__(
             collection_name=collection_name,
-            url=VECTORSTORE_URL,
-            api_key=default_api_key,
+            url=url,
+            api_key=api_key,
             client_kwargs=client_kwargs,
             **kwargs,
         )
```
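With this change the store no longer reads module-level environment globals; callers pass the endpoint explicitly. A hedged usage sketch (the parameter names follow the constructor above; the env var names and values are placeholders):

```python
# Hedged usage sketch: QdrantVectorStore now takes url/api_key explicitly
# instead of reading module-level globals. Env var names are placeholders.
import os

from kotaemon.storages.vectorstores.qdrant import QdrantVectorStore

store = QdrantVectorStore(
    collection_name="index_1",  # placeholder collection name
    url=os.getenv("QDRANT_URL", "http://qdrant:6333"),
    api_key=os.getenv("QDRANT_API_KEY", ""),
)
```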

rag_system/kotaemon/libs/ktem/ktem/index/file/index.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -6,7 +6,7 @@
 from ktem.components import filestorage_path, get_docstore, get_vectorstore
 from ktem.db.engine import engine
 from ktem.index.base import BaseIndex
-from sqlalchemy import JSON, Column, DateTime, Integer, String
+from sqlalchemy import JSON, Column, DateTime, Integer, String, UniqueConstraint
 from sqlalchemy.ext.declarative import declarative_base
 from sqlalchemy.ext.mutable import MutableDict
 from theflow.settings import settings as flowsettings
```

rag_system/kotaemon/libs/ktem/ktem/index/file/pipelines.py

Lines changed: 6 additions & 2 deletions

```diff
@@ -154,7 +154,7 @@ def run(
         # do first round top_k extension
         retrieval_kwargs["do_extend"] = True
         retrieval_kwargs["scope"] = chunk_ids
-        retrieval_kwargs["filters"] = MetadataFilters(
+        """retrieval_kwargs["filters"] = MetadataFilters(
             filters=[
                 MetadataFilter(
                     key="file_id",
@@ -163,7 +163,7 @@ def run(
                 )
             ],
             condition=FilterCondition.OR,
-        )
+        )"""
 
         if self.mmr:
             # TODO: double check that llama-index MMR works correctly
@@ -173,6 +173,10 @@ def run(
         # rerank
         s_time = time.time()
         print(f"retrieval_kwargs: {retrieval_kwargs.keys()}")
+
+        import pdb
+        pdb.set_trace()
+
         docs = self.vector_retrieval(text=text, top_k=self.top_k, **retrieval_kwargs)
         print("retrieval step took", time.time() - s_time)
```
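The hunk above stubs out the `MetadataFilters`-based `file_id` filter in favour of the `scope` of chunk ids set just before it; this is the retrieval fix named in the commit message. A hedged, generic illustration of scope-based restriction (not kotaemon's internal implementation):

```python
# Generic illustration of scope-based restriction (not kotaemon's internals):
# keep only hits whose chunk id belongs to the allowed scope.
from typing import Iterable, List, Tuple


def restrict_to_scope(
    hits: Iterable[Tuple[str, float]],  # (chunk_id, score) pairs
    scope: Iterable[str],               # allowed chunk ids, e.g. chunk_ids
) -> List[Tuple[str, float]]:
    allowed = set(scope)
    return [(cid, score) for cid, score in hits if cid in allowed]


hits = [("c1", 0.91), ("c7", 0.88), ("c3", 0.75)]
print(restrict_to_scope(hits, scope=["c1", "c3"]))
# -> [('c1', 0.91), ('c3', 0.75)]
```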

File renamed without changes.
Lines changed: 1 addition & 0 deletions

```diff
@@ -0,0 +1 @@
+.env
```
