swiss-ai · paultltc · Jun 12, 2025 · May 3, 2025 · May 3, 2025 · May 3, 2025
diff --git a/.github/dependabot.yml b/.github/dependabot.yml
@@ -0,0 +1,11 @@
+# Dependabot options reference: https://docs.github.com/en/code-security/dependabot/working-with-dependabot/dependabot-options-reference#package-ecosystem-
+version: 2
+updates:
+  - package-ecosystem: 'github-actions'  # actions referenced by the workflows
+    directory: '/'
+    schedule:
+      interval: 'monthly'
+  - package-ecosystem: 'pip'  # Python dependencies declared at the repo root
+    directory: '/'
+    schedule:
+      interval: 'weekly'
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
@@ -0,0 +1,32 @@
+# Builds the sdist and wheel and uploads them to PyPI when a GitHub release
+# is created.
+name: 📦 Publish Python Package
+on:
+  release:
+    types: [created]
+jobs:
+  pypi-publish:
+    name: Publish release to PyPI
+    runs-on: ubuntu-latest
+    environment:
+      name: pypi
+      url: https://pypi.org/p/mmore
+    permissions:
+      contents: read  # needed by actions/checkout once permissions are restricted
+      id-token: write  # OIDC token consumed by the PyPI publish action
+    steps:
+      - uses: actions/checkout@v4
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          # Step inputs must be scalars; a list of versions is only valid in a job matrix.
+          python-version: "3.12"
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install build
+      - name: Build package
+        run: |
+          python -m build
+      - name: Publish package distributions to PyPI
+        uses: pypa/gh-action-pypi-publish@release/v1
diff --git a/.github/workflows/ruff.yml b/.github/workflows/ruff.yml
@@ -0,0 +1,16 @@
+# Runs Ruff lint and format checks on pushes to main and on every pull
+# request.
+name: 🧹 Ruff linter checks
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+jobs:
+  lint:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: astral-sh/ruff-action@v3
+      - run: ruff check
+      - run: ruff format --check
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -0,0 +1,35 @@
+# Runs the pytest suite on Python 3.10, 3.11 and 3.12 for pushes to main and
+# for every pull request.
+name: 🧪 PyTest unit tests
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+
+    strategy:
+      matrix:
+        python-version: ["3.10", "3.11", "3.12"]
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -e '.[rag,dev]'  # the dev extra already provides pytest
+
+      - name: Run tests
+        run: |
+          pytest
diff --git a/examples/rag/evaluation/rag_evaluator_example.py b/examples/rag/evaluation/rag_evaluator_example.py
@@ -6,28 +6,44 @@
 
 load_dotenv()
 
-MOCK_EVALUATOR_CONFIG = './examples/rag/evaluation/rag_eval_example_config.yaml'
-MOCK_INDEXER_CONFIG = './examples/rag/evaluation/indexer_eval_example_config.yaml'
-MOCK_RAG_CONFIG = './examples/rag/evaluation/rag_evaluated_example_config.yaml'
+MOCK_EVALUATOR_CONFIG = "./examples/rag/evaluation/rag_eval_example_config.yaml"
+MOCK_INDEXER_CONFIG = "./examples/rag/evaluation/indexer_eval_example_config.yaml"
+MOCK_RAG_CONFIG = "./examples/rag/evaluation/rag_evaluated_example_config.yaml"
+
 
 def get_args():
-    parser = argparse.ArgumentParser(description='Run RAG Evaluation pipeline with specified parameters or use default mock data')
-    parser.add_argument('--eval-config', type=str, default=MOCK_EVALUATOR_CONFIG, help='Path to a rag evaluator config file.')
-    parser.add_argument('--indexer-config', type=str, default=MOCK_INDEXER_CONFIG, help='Path to an Indexer config file.')
-    parser.add_argument('--rag-config', type=str, default=MOCK_RAG_CONFIG, help='Path to a rag config file.')
+    parser = argparse.ArgumentParser(
+        description="Run RAG Evaluation pipeline with specified parameters or use default mock data"
+    )
+    parser.add_argument(
+        "--eval-config",
+        type=str,
+        default=MOCK_EVALUATOR_CONFIG,
+        help="Path to a rag evaluator config file.",
+    )
+    parser.add_argument(
+        "--indexer-config",
+        type=str,
+        default=MOCK_INDEXER_CONFIG,
+        help="Path to an Indexer config file.",
+    )
+    parser.add_argument(
+        "--rag-config",
+        type=str,
+        default=MOCK_RAG_CONFIG,
+        help="Path to a rag config file.",
+    )
 
     return parser.parse_args()
 
+
 if __name__ == "__main__":
     args = get_args()
 
     # Instantiate RAGEvaluator
     evaluator = RAGEvaluator.from_config(args.eval_config)
 
     # Run the evaluation
-    result = evaluator(
-        indexer_config = args.indexer_config,
-        rag_config = args.rag_config
-    )
+    result = evaluator(indexer_config=args.indexer_config, rag_config=args.rag_config)
 
-    print(result)
+    print(result)
diff --git a/pyproject.toml b/pyproject.toml
@@ -101,6 +101,27 @@ cu124 = [
     "torch>=2.5.1",
 ]
 
+rag = [
+  "accelerate",
+  "langchain-anthropic==0.3.4",
+  "langchain-aws",
+  "langchain-cohere==0.4.2",
+  "langchain-huggingface==0.1.2",
+  "langchain-milvus==0.1.8",
+  "langchain-mistralai==0.2.7",
+  "langchain-nvidia-ai-endpoints",
+  "langchain-openai==0.3.7",
+  "langchain==0.3.20",
+  "langdetect>=1.0.9",
+  "langserve[all]==0.3.1",
+  "pymilvus==2.5.0",
+  "milvus-model==0.2.12",
+  "ragas==0.2.6",
+  "nltk>=3.9",
+]
+
+dev = ["pytest>=8.0.0", "ruff>=0.4.0"]
+
 [tool.uv]
 conflicts = [
   [
@@ -131,3 +152,18 @@ profile = "black"
 
 [project.scripts]
 mmore = "mmore.cli:main"
+
+[tool.pytest.ini_options]
+filterwarnings = ["ignore::Warning"]
+testpaths = ["tests"]
+
+[tool.ruff]
+exclude = ["src/mmore/run_retriever.py"]    # TODO: add back when GH CI bug is fixed
+
+[tool.ruff.lint]
+select = ["E", "F", "W", "I", "N"]
+ignore = ["E501"]   # Avoid enforcing line-length violations (`E501`)
+
+[tool.ruff.lint.per-file-ignores]
+"__init__.py" = ["F401"]
+"run_index_api.py" = ["N803", "N806"]
diff --git a/scripts/data_extractor.py b/scripts/data_extractor.py
@@ -1,16 +1,16 @@
+import os
+import zipfile
+
 import requests
 
 url = "https://corp.digitalcorpora.org/corpora/files/CC-MAIN-2021-31-PDF-UNTRUNCATED/zipfiles/0000-0999/0000.zip"
 response = requests.get(url)
 with open("0000.zip", "wb") as f:
     f.write(response.content)
 # Unzip the file
-import zipfile
 
 with zipfile.ZipFile("0000.zip", "r") as zip_ref:
     zip_ref.extractall("0000")
-# Read the PDF files
-import os
 
 print(os.listdir("0000"))
 
@@ -20,4 +20,4 @@
 
 # Extract 100 files, and copy them in '0000_small' folder
 for i in range(100):
-    os.system(f'cp 0000/{os.listdir("0000")[i]} 0000_small')
+    os.system(f"cp 0000/{os.listdir('0000')[i]} 0000_small")
diff --git a/src/mmore/cli.py b/src/mmore/cli.py
@@ -33,7 +33,10 @@ def process(config_file: str):
     help="Path to the config file for post-processing.",
 )
 @click.option(
-    "--input-data", type=str, required=True, help="Path to the input JSONL file of documents."
+    "--input-data",
+    type=str,
+    required=True,
+    help="Path to the input JSONL file of documents.",
 )
 def postprocess(config_file: str, input_data: str):
     """Run the post-processors pipeline.
@@ -59,7 +62,11 @@ def postprocess(config_file: str, input_data: str):
     help="Path to the config file for indexing.",
 )
 @click.option(
-    "--documents-path", "-f", type=str, required=False, help="Path to the JSONL file of the (post)processed documents."
+    "--documents-path",
+    "-f",
+    type=str,
+    required=False,
+    help="Path to the JSONL file of the (post)processed documents.",
 )
 @click.option(
     "--collection-name",
@@ -93,10 +100,18 @@ def index(config_file: str, documents_path: str, collection_name: str):
     help="Dispatcher configuration file path.",
 )
 @click.option(
-    "--input-file", "-f", type=str, required=True, help="Path to the JSONL file of the input queries."
+    "--input-file",
+    "-f",
+    type=str,
+    required=True,
+    help="Path to the JSONL file of the input queries.",
 )
 @click.option(
-    "--output-file", "-o", type=str, required=True, help="Path to which save the results of the retriever as a JSON."
+    "--output-file",
+    "-o",
+    type=str,
+    required=True,
+    help="Path to which save the results of the retriever as a JSON.",
 )
 def retrieve(config_file: str, input_file: str, output_file: str):
     """Retrieve documents for specified queries.
@@ -171,8 +186,8 @@ def dashboard_backend(host, port):
     """Run the dashboard backend.
 
     Args:
-      host: 
-      port: 
+      host:
+      port:
 
     Returns:
 

diff --git a/src/mmore/process/execution_state.py b/src/mmore/process/execution_state.py
@@ -31,15 +31,15 @@ def initialize(distributed_mode=False, client=None):
         """
         if ExecutionState._use_dask is not None:
             raise Exception("Execution state already initialized")
-        assert (
-            distributed_mode is not None
-        ), "Distributed mode must be set to True or False"
+        assert distributed_mode is not None, (
+            "Distributed mode must be set to True or False"
+        )
         ExecutionState._use_dask = distributed_mode
 
         if distributed_mode:
-            assert (
-                client is not None
-            ), "You must be in the context of a dask client to use distributed mode"
+            assert client is not None, (
+                "You must be in the context of a dask client to use distributed mode"
+            )
             ExecutionState._dask_var = Variable("should_stop_execution", client=client)
             ExecutionState._dask_var.set(False)
             logger.info("Execution state initialized (distributed mode)")

diff --git a/src/mmore/process/post_processor/pipeline.py b/src/mmore/process/post_processor/pipeline.py
@@ -49,7 +49,7 @@ def _log_plan(self):
         logger.info("-" * 50)
         logger.info("PP Pipeline:")
         for i, processor in enumerate(self.post_processors):
-            logger.info(f"  > {i+1}. {processor.name}")
+            logger.info(f"  > {i + 1}. {processor.name}")
         logger.info("-" * 50)
 
     @classmethod
@@ -75,7 +75,7 @@ def run(self, samples: List[MultimodalSample]) -> List[MultimodalSample]:
         for i, processor in enumerate(self.post_processors):
             samples = processor.batch_process(samples)
             if self.output_config.save_each_step:
-                self.save_results(samples, f"{i+1}___{processor.name}.jsonl")
+                self.save_results(samples, f"{i + 1}___{processor.name}.jsonl")
         self.save_results(samples, "final_pp.jsonl")
         return samples
 

diff --git a/src/mmore/process/processors/pdf_processor.py b/src/mmore/process/processors/pdf_processor.py
@@ -158,8 +158,8 @@ def _extract_images(pdf_doc, xref) -> Optional[Image.Image]:
             if self.config.custom_config.get("extract_images", True):
                 for img_info in page.get_images(full=False):
                     image = _extract_images(pdf_doc, img_info[0])
-                    if image and clean_image(
-                        image
+                    if (
+                        image and clean_image(image)
                     ):  # clean image filters images below size 512x512 and variance below 100, these are defaults and can be changed
                         embedded_images.append(image)
                         all_text.append(self.config.attachment_tag)

diff --git a/src/mmore/rag/llm.py b/src/mmore/rag/llm.py
@@ -74,7 +74,9 @@ def __post_init__(self):
                     else (
                         "COHERE"
                         if self.llm_name in _COHERE_MODELS
-                        else "HF" if self.base_url is None else None
+                        else "HF"
+                        if self.base_url is None
+                        else None
                     )
                 )
             )

diff --git a/src/mmore/rag/retriever.py b/src/mmore/rag/retriever.py
@@ -104,9 +104,9 @@ def retrieve(
         if k == 0:
             return []
 
-        assert search_type in get_args(
-            self._search_types
-        ), f"Invalid search_type: {search_type}. Must be 'dense', 'sparse', or 'hybrid'"
+        assert search_type in get_args(self._search_types), (
+            f"Invalid search_type: {search_type}. Must be 'dense', 'sparse', or 'hybrid'"
+        )
         search_weight = self._search_weights.get(search_type, self.hybrid_search_weight)
 
         dense_embedding, sparse_embedding = self.compute_query_embeddings(query)

diff --git a/src/mmore/run_dashboard_backend.py b/src/mmore/run_dashboard_backend.py
@@ -9,7 +9,7 @@
 from pymongo import DESCENDING
 from starlette.middleware.cors import CORSMiddleware
 
-from .dashboard.backend.model import (
+from mmore.dashboard.backend.model import (
     BatchedReports,
     DashboardMetadata,
     Progress,

diff --git a/src/mmore/run_index.py b/src/mmore/run_index.py
@@ -6,6 +6,10 @@
 
 from dotenv import load_dotenv
 
+from mmore.index.indexer import Indexer, IndexerConfig
+from mmore.type import MultimodalSample
+from mmore.utils import load_config
+
 logger = logging.getLogger(__name__)
 INDEX_EMOJI = "🗂️"
 logging.basicConfig(
@@ -14,10 +18,6 @@
     datefmt="%Y-%m-%d %H:%M:%S",
 )
 
-from .index.indexer import Indexer, IndexerConfig
-from .type import MultimodalSample
-from .utils import load_config
-
 load_dotenv()