Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions .github/dependabot.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Dependabot configuration. Option reference:
# https://docs.github.com/en/code-security/dependabot/working-with-dependabot/dependabot-options-reference#package-ecosystem-
version: 2
updates:
  # Python packages resolved from the repository root, checked weekly.
  - package-ecosystem: pip
    directory: "/"
    schedule:
      interval: weekly
  # GitHub Actions versions, checked monthly.
  - package-ecosystem: github-actions
    directory: "/"
    schedule:
      interval: monthly
28 changes: 28 additions & 0 deletions .github/workflows/publish.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
name: 📦 Publish Python Package

# Build the source distribution and wheel, then upload them to PyPI,
# whenever a GitHub release is created.
on:
  release:
    types: [created]

jobs:
  pypi-publish:
    name: Publish release to PyPI
    runs-on: ubuntu-latest
    environment:
      name: pypi
      url: https://pypi.org/p/mmore
    permissions:
      # Lets the publish action request an OIDC token (no API token is
      # configured in this workflow).
      id-token: write
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          # Step inputs must be scalars: a list of versions is only meaningful
          # under `strategy.matrix`, and a release must be built and uploaded
          # exactly once, so a single interpreter is pinned here.
          python-version: "3.12"
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install build
      - name: Build package
        run: |
          # `python -m build` replaces the deprecated direct `setup.py` invocation
          # and writes the sdist and wheel to dist/, where the publish step looks.
          python -m build
      - name: Publish package distributions to PyPI
        uses: pypa/gh-action-pypi-publish@release/v1
14 changes: 14 additions & 0 deletions .github/workflows/ruff.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
name: 🧹 Ruff linter checks

# Lint and format-check every pull request and every push to main.
on:
  push:
    branches:
      - main
  pull_request:

jobs:
  lint:
    runs-on: ubuntu-latest
    steps:
      # Same checkout major version as the publish workflow.
      - uses: actions/checkout@v4
      # Provides the `ruff` executable that the two steps below invoke.
      - uses: astral-sh/ruff-action@v3
      - run: ruff check
      - run: ruff format --check
34 changes: 34 additions & 0 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
name: 🧪 PyTest unit tests

# Run the test suite on every pull request and every push to main.
on:
  push:
    branches:
      - main
  pull_request:

jobs:
  test:
    runs-on: ubuntu-latest

    strategy:
      matrix:
        python-version: ["3.10", "3.11", "3.12"]

    steps:
      - name: Checkout code
        # Same checkout major version as the publish workflow.
        uses: actions/checkout@v4

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          # The `dev` extra already requires pytest, so no separate install is needed.
          pip install -e '.[rag,dev]'

      - name: Run tests
        run: |
          pytest
40 changes: 28 additions & 12 deletions examples/rag/evaluation/rag_evaluator_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,28 +6,44 @@

load_dotenv()

MOCK_EVALUATOR_CONFIG = './examples/rag/evaluation/rag_eval_example_config.yaml'
MOCK_INDEXER_CONFIG = './examples/rag/evaluation/indexer_eval_example_config.yaml'
MOCK_RAG_CONFIG = './examples/rag/evaluation/rag_evaluated_example_config.yaml'
MOCK_EVALUATOR_CONFIG = "./examples/rag/evaluation/rag_eval_example_config.yaml"
MOCK_INDEXER_CONFIG = "./examples/rag/evaluation/indexer_eval_example_config.yaml"
MOCK_RAG_CONFIG = "./examples/rag/evaluation/rag_evaluated_example_config.yaml"


def get_args():
parser = argparse.ArgumentParser(description='Run RAG Evaluation pipeline with specified parameters or use default mock data')
parser.add_argument('--eval-config', type=str, default=MOCK_EVALUATOR_CONFIG, help='Path to a rag evaluator config file.')
parser.add_argument('--indexer-config', type=str, default=MOCK_INDEXER_CONFIG, help='Path to an Indexer config file.')
parser.add_argument('--rag-config', type=str, default=MOCK_RAG_CONFIG, help='Path to a rag config file.')
parser = argparse.ArgumentParser(
description="Run RAG Evaluation pipeline with specified parameters or use default mock data"
)
parser.add_argument(
"--eval-config",
type=str,
default=MOCK_EVALUATOR_CONFIG,
help="Path to a rag evaluator config file.",
)
parser.add_argument(
"--indexer-config",
type=str,
default=MOCK_INDEXER_CONFIG,
help="Path to an Indexer config file.",
)
parser.add_argument(
"--rag-config",
type=str,
default=MOCK_RAG_CONFIG,
help="Path to a rag config file.",
)

return parser.parse_args()


if __name__ == "__main__":
args = get_args()

# Instantiate RAGEvaluator
evaluator = RAGEvaluator.from_config(args.eval_config)

# Run the evaluation
result = evaluator(
indexer_config = args.indexer_config,
rag_config = args.rag_config
)
result = evaluator(indexer_config=args.indexer_config, rag_config=args.rag_config)

print(result)
print(result)
36 changes: 36 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,27 @@ cu124 = [
"torch>=2.5.1",
]

rag = [
"accelerate",
"langchain-anthropic==0.3.4",
"langchain-aws",
"langchain-cohere==0.4.2",
"langchain-huggingface==0.1.2",
"langchain-milvus==0.1.8",
"langchain-mistralai==0.2.7",
"langchain-nvidia-ai-endpoints",
"langchain-openai==0.3.7",
"langchain==0.3.20",
"langdetect>=1.0.9",
"langserve[all]==0.3.1",
"pymilvus==2.5.0",
"milvus-model==0.2.12",
"ragas==0.2.6",
"nltk>=3.9",
]

dev = ["pytest>=8.0.0", "ruff>=0.4.0"]

[tool.uv]
conflicts = [
[
Expand Down Expand Up @@ -131,3 +152,18 @@ profile = "black"

[project.scripts]
mmore = "mmore.cli:main"

[tool.pytest.ini_options]
filterwarnings = ["ignore::Warning"]
testpaths = ["tests"]

[tool.ruff]
exclude = ["src/mmore/run_retriever.py"] # TODO: add back when GH CI bug is fixed

[tool.ruff.lint]
select = ["E", "F", "W", "I", "N"]
ignore = ["E501"] # Avoid enforcing line-length violations (`E501`)

[tool.ruff.lint.per-file-ignores]
"__init__.py" = ["F401"]
"run_index_api.py" = ["N803", "N806"]
8 changes: 4 additions & 4 deletions scripts/data_extractor.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
import os
import zipfile

import requests

url = "https://corp.digitalcorpora.org/corpora/files/CC-MAIN-2021-31-PDF-UNTRUNCATED/zipfiles/0000-0999/0000.zip"
response = requests.get(url)
with open("0000.zip", "wb") as f:
f.write(response.content)
# Unzip the file
import zipfile

with zipfile.ZipFile("0000.zip", "r") as zip_ref:
zip_ref.extractall("0000")
# Read the PDF files
import os

print(os.listdir("0000"))

Expand All @@ -20,4 +20,4 @@

# Extract 100 files, and copy them in '0000_small' folder
for i in range(100):
os.system(f'cp 0000/{os.listdir("0000")[i]} 0000_small')
os.system(f"cp 0000/{os.listdir('0000')[i]} 0000_small")
27 changes: 21 additions & 6 deletions src/mmore/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,10 @@ def process(config_file: str):
help="Path to the config file for post-processing.",
)
@click.option(
"--input-data", type=str, required=True, help="Path to the input JSONL file of documents."
"--input-data",
type=str,
required=True,
help="Path to the input JSONL file of documents.",
)
def postprocess(config_file: str, input_data: str):
"""Run the post-processors pipeline.
Expand All @@ -59,7 +62,11 @@ def postprocess(config_file: str, input_data: str):
help="Path to the config file for indexing.",
)
@click.option(
"--documents-path", "-f", type=str, required=False, help="Path to the JSONL file of the (post)processed documents."
"--documents-path",
"-f",
type=str,
required=False,
help="Path to the JSONL file of the (post)processed documents.",
)
@click.option(
"--collection-name",
Expand Down Expand Up @@ -93,10 +100,18 @@ def index(config_file: str, documents_path: str, collection_name: str):
help="Dispatcher configuration file path.",
)
@click.option(
"--input-file", "-f", type=str, required=True, help="Path to the JSONL file of the input queries."
"--input-file",
"-f",
type=str,
required=True,
help="Path to the JSONL file of the input queries.",
)
@click.option(
"--output-file", "-o", type=str, required=True, help="Path to which save the results of the retriever as a JSON."
"--output-file",
"-o",
type=str,
required=True,
help="Path to which save the results of the retriever as a JSON.",
)
def retrieve(config_file: str, input_file: str, output_file: str):
"""Retrieve documents for specified queries.
Expand Down Expand Up @@ -171,8 +186,8 @@ def dashboard_backend(host, port):
"""Run the dashboard backend.

Args:
host:
port:
host:
port:

Returns:

Expand Down
12 changes: 6 additions & 6 deletions src/mmore/process/execution_state.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,15 +31,15 @@ def initialize(distributed_mode=False, client=None):
"""
if ExecutionState._use_dask is not None:
raise Exception("Execution state already initialized")
assert (
distributed_mode is not None
), "Distributed mode must be set to True or False"
assert distributed_mode is not None, (
"Distributed mode must be set to True or False"
)
ExecutionState._use_dask = distributed_mode

if distributed_mode:
assert (
client is not None
), "You must be in the context of a dask client to use distributed mode"
assert client is not None, (
"You must be in the context of a dask client to use distributed mode"
)
ExecutionState._dask_var = Variable("should_stop_execution", client=client)
ExecutionState._dask_var.set(False)
logger.info("Execution state initialized (distributed mode)")
Expand Down
4 changes: 2 additions & 2 deletions src/mmore/process/post_processor/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def _log_plan(self):
logger.info("-" * 50)
logger.info("PP Pipeline:")
for i, processor in enumerate(self.post_processors):
logger.info(f" > {i+1}. {processor.name}")
logger.info(f" > {i + 1}. {processor.name}")
logger.info("-" * 50)

@classmethod
Expand All @@ -75,7 +75,7 @@ def run(self, samples: List[MultimodalSample]) -> List[MultimodalSample]:
for i, processor in enumerate(self.post_processors):
samples = processor.batch_process(samples)
if self.output_config.save_each_step:
self.save_results(samples, f"{i+1}___{processor.name}.jsonl")
self.save_results(samples, f"{i + 1}___{processor.name}.jsonl")
self.save_results(samples, "final_pp.jsonl")
return samples

Expand Down
4 changes: 2 additions & 2 deletions src/mmore/process/processors/pdf_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,8 +158,8 @@ def _extract_images(pdf_doc, xref) -> Optional[Image.Image]:
if self.config.custom_config.get("extract_images", True):
for img_info in page.get_images(full=False):
image = _extract_images(pdf_doc, img_info[0])
if image and clean_image(
image
if (
image and clean_image(image)
): # clean image filters images below size 512x512 and variance below 100, these are defaults and can be changed
embedded_images.append(image)
all_text.append(self.config.attachment_tag)
Expand Down
4 changes: 3 additions & 1 deletion src/mmore/rag/llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,9 @@ def __post_init__(self):
else (
"COHERE"
if self.llm_name in _COHERE_MODELS
else "HF" if self.base_url is None else None
else "HF"
if self.base_url is None
else None
)
)
)
Expand Down
6 changes: 3 additions & 3 deletions src/mmore/rag/retriever.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,9 +104,9 @@ def retrieve(
if k == 0:
return []

assert search_type in get_args(
self._search_types
), f"Invalid search_type: {search_type}. Must be 'dense', 'sparse', or 'hybrid'"
assert search_type in get_args(self._search_types), (
f"Invalid search_type: {search_type}. Must be 'dense', 'sparse', or 'hybrid'"
)
search_weight = self._search_weights.get(search_type, self.hybrid_search_weight)

dense_embedding, sparse_embedding = self.compute_query_embeddings(query)
Expand Down
2 changes: 1 addition & 1 deletion src/mmore/run_dashboard_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from pymongo import DESCENDING
from starlette.middleware.cors import CORSMiddleware

from .dashboard.backend.model import (
from mmore.dashboard.backend.model import (
BatchedReports,
DashboardMetadata,
Progress,
Expand Down
8 changes: 4 additions & 4 deletions src/mmore/run_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@

from dotenv import load_dotenv

from mmore.index.indexer import Indexer, IndexerConfig
from mmore.type import MultimodalSample
from mmore.utils import load_config

logger = logging.getLogger(__name__)
INDEX_EMOJI = "🗂️"
logging.basicConfig(
Expand All @@ -14,10 +18,6 @@
datefmt="%Y-%m-%d %H:%M:%S",
)

from .index.indexer import Indexer, IndexerConfig
from .type import MultimodalSample
from .utils import load_config

load_dotenv()


Expand Down
Loading