Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
7d47de5
commit
Matthew2357 May 1, 2025
3b2ae66
Merge branch 'swiss-ai:master' into master
Matthew2357 May 1, 2025
dfa1b87
html processor
Matthew2357 May 5, 2025
93f9d09
Merge branch 'master' into master
fabnemEPFL May 16, 2025
db6918a
commit for github
Matthew2357 May 18, 2025
7ab5872
Merge branch 'master' of https://github.com/Matthew2357/mmore
Matthew2357 May 18, 2025
d62a025
asdfadsf
Matthew2357 May 18, 2025
cd6c712
Merge branch 'swiss-ai:master' into master
Matthew2357 Jun 2, 2025
e67fabd
commit
Matthew2357 Jun 10, 2025
c679fbe
commit
Matthew2357 Jun 11, 2025
2536419
commit
Matthew2357 Jun 11, 2025
f9f9cba
commit
Matthew2357 Jun 11, 2025
6554950
commit
Matthew2357 Jun 13, 2025
fcb5e35
oops I forgot to include everything in the last commit
Matthew2357 Jun 13, 2025
228ab64
update to the pyproject.toml for langchain_community
Matthew2357 Jun 13, 2025
3f86165
commit to make git happy
Matthew2357 Jun 23, 2025
ea6f835
committtt
Matthew2357 Jun 26, 2025
5a1eb02
ahhh
Matthew2357 Jun 26, 2025
e03991e
commit before PR
Matthew2357 Jun 26, 2025
df5d2da
fixes to RAG CLI
Matthew2357 Jun 27, 2025
6212e73
integrate webrag to new version or retriever
Matthew2357 Jun 27, 2025
33130ce
small fixes
Matthew2357 Jun 28, 2025
441ec69
fixes
Matthew2357 Jun 28, 2025
469131a
rename ragcli.py
Matthew2357 Jun 28, 2025
536143f
Fixed indentation in run_ragcli.py
fabnemEPFL Jun 30, 2025
9739006
Merge branch 'master' into RagCLi
fabnemEPFL Jun 30, 2025
9b72272
cosmetic changes
Matthew2357 Jun 30, 2025
106bbd8
Merge branch 'RagCLi' of https://github.com/Matthew2357/mmore into Ra…
Matthew2357 Jun 30, 2025
6a23714
reformatting
fabnemEPFL Jul 2, 2025
373de55
small fixes and documentation
Matthew2357 Jul 8, 2025
6f374ce
Merge branch 'RagCLi' of https://github.com/Matthew2357/mmore into Ra…
Matthew2357 Jul 8, 2025
6b5090c
ruff formatting and small fixes
Matthew2357 Jul 13, 2025
5363c72
Fix inconsistency in pyproject.toml
fabnemEPFL Aug 8, 2025
9e7879a
Merge branch 'master' into RagCLi
fabnemEPFL Aug 8, 2025
992a5db
Fixed typo in rag.md
fabnemEPFL Aug 8, 2025
f170252
Change the default output path in cc/process_config.yaml
fabnemEPFL Aug 8, 2025
c5cd817
Removed commented code in run_ragcli.py
fabnemEPFL Aug 8, 2025
05f7399
Formatting
fabnemEPFL Aug 8, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions docs/rag.md
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,17 @@ Our RAG pipeline is built to take full advantage of [LangChain](https://python.l

Our retriever is a LangChain [`BaseRetriever`](https://python.langchain.com/api_reference/core/retrievers/langchain_core.retrievers.BaseRetriever.html). If you want to create a custom retriever (e.g. GraphRetriever,...) you can simply make it inherit from this class and use it as described in our examples.

#### WebRAG (only in local mode at the moment)
When running RAG in local mode, you can enable WebRAG by setting `use_web: true` in the retriever configuration: the query is sent to the [`DuckDuckGo Search API`](https://python.langchain.com/docs/integrations/tools/ddg/), and the search results are added to the context alongside the retrieved documents.

#### CLI for RAG (only in local mode at the moment)
A user-friendly CLI for RAG. Start the RAG CLI with the `ragcli` command and your config file:
```bash
python3 -m mmore ragcli --config-file /path/to/config.yaml
```

You can customize the CLI by defining [a RAG configuration file](/examples/rag/config.yaml) or by setting preferences from within the CLI.

#### LLM

Our LLMs are built on LangChain's [`BaseChatModel`](https://python.langchain.com/api_reference/core/language_models/langchain_core.language_models.chat_models.BaseChatModel.html) base class. If you want to create a custom LLM, you can simply make it inherit from this class and use it as described in our examples.
Expand Down
2 changes: 1 addition & 1 deletion examples/cc/process_config.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
dispatcher_config:
output_path: /mnt/mlo/scratch/homes/mmore/datasets/cc_sample/mmore_output_fast/
output_path: examples/mmore_output_fast/
use_fast_processors: true
distributed: false
extract_images: true
Expand Down
1 change: 1 addition & 0 deletions examples/rag/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ rag:
name: 'my_db'
hybrid_search_weight: 0.5
k: 5
use_web: true
system_prompt: "Use the following context to answer the questions.\n\nContext:\n{context}"
mode: local
mode_args:
Expand Down
5 changes: 3 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -78,12 +78,13 @@ dependencies = [
"langchain-anthropic==0.3.4",
"langchain-aws==0.2.22",
"langchain-cohere==0.4.2",
"langchain_community==0.3.25",
"langchain-huggingface==0.1.2",
"langchain-milvus==0.1.8",
"langchain-mistralai==0.2.7",
"langchain-nvidia-ai-endpoints",
"langchain-openai==0.3.7",
"langchain==0.3.20",
"langchain==0.3.25",
"langserve[all]==0.3.1",
"markdownify==0.13.1",
"ragas==0.2.6",
Expand Down Expand Up @@ -118,7 +119,7 @@ rag = [
"langchain-mistralai==0.2.7",
"langchain-nvidia-ai-endpoints",
"langchain-openai==0.3.7",
"langchain==0.3.20",
"langchain==0.3.25",
"langdetect>=1.0.9",
"langserve[all]==0.3.1",
"pymilvus==2.5.0",
Expand Down
19 changes: 19 additions & 0 deletions src/mmore/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,5 +253,24 @@ def dashboard_backend(host, port):
run_api(host, port)


@main.command()
@click.option(
    "--config-file", type=str, required=True, help="Configuration for the RAG CLI."
)
def ragcli(config_file: str):
    """Run the RAG CLI.

    Args:
        config_file: Path to the RAG configuration file handed to the CLI.

    Returns:
        None. The call blocks until the interactive session ends.
    """
    # Deferred import: loading run_ragcli pulls in huggingface_hub and the RAG
    # pipeline and configures logging, none of which other commands need.
    from .run_ragcli import RagCLI

    RagCLI(config_file).launch_cli()


if __name__ == "__main__":
main()
44 changes: 40 additions & 4 deletions src/mmore/rag/retriever.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from dataclasses import dataclass, field
from typing import Any, Dict, List, Literal, Optional, Tuple, cast, get_args

from langchain_community.utilities import DuckDuckGoSearchAPIWrapper
from langchain_core.callbacks import CallbackManagerForRetrieverRun
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
Expand All @@ -28,6 +29,7 @@ class RetrieverConfig:
hybrid_search_weight: float = 0.5
k: int = 1
collection_name: str = "my_docs"
use_web: bool = False


class Retriever(BaseRetriever):
Expand All @@ -38,6 +40,7 @@ class Retriever(BaseRetriever):
client: MilvusClient
hybrid_search_weight: float
k: int
use_web: bool

_search_types = Literal["dense", "sparse", "hybrid"]

Expand Down Expand Up @@ -78,6 +81,7 @@ def from_config(cls, config: str | RetrieverConfig):
client=client,
hybrid_search_weight=config.hybrid_search_weight,
k=config.k,
use_web=config.use_web,
)

def compute_query_embeddings(
Expand Down Expand Up @@ -271,18 +275,50 @@ def _get_relevant_documents(
document_ids=document_ids,
)

def parse_result(result: Dict[str, Any], i: int) -> Document:
def parse_result(result: Dict[str, Any], i: int, offset: int = 0) -> Document:
return Document(
page_content=result["entity"]["text"],
metadata={
"id": result["id"],
"rank": i + 1,
"rank": offset + i + 1,
"similarity": result["distance"],
},
)

# 0 because there is only one query
return [parse_result(result, i) for i, result in enumerate(results)]
def parse_results(
results: List[Dict[str, Any]], offset: int = 0
) -> List[Document]:
return [parse_result(result, i, offset) for i, result in enumerate(results)]

if self.use_web:
web_docs = self._get_web_documents(query["input"], max_results=self.k)
milvus_docs = parse_results(results, len(web_docs))
return web_docs + milvus_docs
else:
milvus_docs = parse_results(results)
return milvus_docs

def _get_web_documents(self, query: str, max_results: int = 5) -> List[Document]:
    """Fetch additional context from the web via DuckDuckGo.

    Args:
        query: Text sent to the search engine.
        max_results: Upper bound on the number of search hits requested.

    Returns:
        One ``Document`` per hit, with the snippet as content and the url,
        title and 1-based rank in the metadata. An empty list is returned
        when the search (or the conversion of its results) raises.
    """
    logger.info("Performing web search...")
    try:
        hits = DuckDuckGoSearchAPIWrapper().results(query, max_results=max_results)
        web_docs: List[Document] = []
        for position, hit in enumerate(hits, start=1):
            web_docs.append(
                Document(
                    page_content=hit["snippet"],
                    metadata={
                        "source": "duckduckgo",
                        "url": hit["link"],
                        "title": hit["title"],
                        "rank": position,
                    },
                )
            )
        return web_docs
    except Exception as e:
        # Web search is best-effort: a failure must not break retrieval.
        logger.warning(f"Langchain-DuckDuckGo search failed: {e}")
        return []

def get_documents_by_ids(
self, doc_ids: list[str], collection_name: str = "my_docs"
Expand Down
229 changes: 229 additions & 0 deletions src/mmore/run_ragcli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,229 @@
import argparse
import logging

from huggingface_hub import model_info
from huggingface_hub.utils import HfHubHTTPError

# Decorative marker embedded in the prefix of every log line of the RAG CLI.
RAG_EMOJI = "🧠🧠🧠🧠🧠"

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Configure the root logger at import time: INFO level, timestamped prefix.
logging.basicConfig(
    level=logging.INFO,
    datefmt="%Y-%m-%d %H:%M:%S",
    format=f"[RAG {RAG_EMOJI} -- %(asctime)s] %(message)s",
)

from .rag.pipeline import RAGPipeline
from .run_rag import RAGInferenceConfig
from .utils import load_config


class RagCLI:
    """Interactive command-line interface around the RAG pipeline.

    The configuration file is only read the first time a command needs it, and
    the pipeline is only (re)built when a query is asked while no pipeline
    exists yet or after a setting was changed.
    """

    # Shared by the `help` command so command names are spelled in one place.
    _COMMAND_SUMMARY = "Available commands are: config, rag, setK, setModel, setWebrag, exit, help. To learn more about usage of a specific command, use the following: \n help <command>"

    # (command, one-line description) rows shown in the start-up banner.
    _COMMAND_TABLE = (
        ("config", "see the current config"),
        ("rag", "enter the RAG CLI"),
        ("setK", "set the number of documents to retrieve"),
        ("setModel", "set the model for generation"),
        ("setWebrag", "decide whether to use web rag"),
        ("help", "learn more about a command"),
        ("exit", "exit the CLI"),
    )

    # Text printed by `help <command>`; keys are the exact command names.
    _HELP_TOPICS = {
        "help": "To see a list of commands, use the command 'help'. To learn more about usage of a specific command, use the following: \n help <command>",
        "config": "Print the current configuration.",
        "rag": "Enter the RAG CLI. Type /bye to exit.",
        "setK": "Use the command in the following way: 'setK <k>', for a positive integer k. This will set the number of documents to retrieve during RAG.",
        "setModel": "Use the command in the following way: 'setModel <model_path>', where model_path is the huggingface path to the model you'd like to use.",
        "setWebrag": "Use the command in the following way: 'setWebrag <bool>', where bool is either True or False. This will determine if a web search is done during RAG.",
        "exit": "Exit the CLI.",
    }

    def __init__(self, config_file: str):
        """Remember the config path; nothing is loaded until it is needed.

        Args:
            config_file: Path to the RAG configuration file.
        """
        self.ragConfig = None  # loaded lazily by init_config()
        self.ragPP = None  # built lazily by initialize_ragpp()
        # True when the configuration changed since the pipeline was built,
        # i.e. the pipeline must be rebuilt before the next query.
        self.modified: bool = False
        self.config_file = config_file

    def launch_cli(self):
        """Run the top-level prompt until ``exit``, EOF or Ctrl-C."""
        self._print_banner()
        while True:
            try:
                cmd = input("> ").strip()
                if not cmd:
                    # Just pressing Enter is not an error; prompt again.
                    continue
                if cmd == "exit":
                    print("Goodbye!")
                    break
                self._dispatch(cmd)
            except (EOFError, KeyboardInterrupt):
                # Also reached when the nested `rag` prompt is interrupted.
                print("\nExiting...")
                break

    def _print_banner(self) -> None:
        """Print the welcome message, the command table and the help hint."""
        print_in_color(
            "Welcome to this RAG command-line interface! 🧠", "green", bold=True
        )
        rows = "\n".join(
            f"{str_green(name)} : {description}"
            for name, description in self._COMMAND_TABLE
        )
        print(f"Available commands:\n{rows}")
        print_in_color(
            "To learn more about usage of a specific command, use the following: \n help <command>",
            "blue",
            bold=True,
        )

    def _dispatch(self, cmd: str) -> None:
        """Execute one non-empty, already stripped command line."""
        name, _, arg = cmd.partition(" ")
        arg = arg.strip()
        setters = {
            "setK": self._set_k,
            "setModel": self._set_model,
            "setWebrag": self._set_webrag,
        }

        if name == "help":
            self._print_help(arg)
        elif cmd == "config":
            self._show_config()
        elif cmd == "rag":
            self.cli_ception()
        elif name == "greet" and arg:
            # Undocumented greeting command, kept for backward compatibility.
            print(f"Hello, {str_in_color(arg, 'yellow', True)}!")
        elif name in setters:
            if arg:
                setters[name](arg)
            else:
                # A setter without its argument: show how to use it.
                print(self._HELP_TOPICS[name])
        else:
            print(f"Unknown command: {cmd}")

    def _print_help(self, topic: str) -> None:
        """Print the command summary, or the usage of the command *topic*."""
        if not topic:
            print(self._COMMAND_SUMMARY)
        else:
            print(
                self._HELP_TOPICS.get(topic, "Sorry, this command does not exist.")
            )

    def _show_config(self) -> None:
        """Print the settings that can be changed from within the CLI."""
        self.init_config()
        confrag = self.ragConfig.rag
        print(
            f"k: {str_in_color(confrag.retriever.k, 'blue')} \nmodel: {str_in_color(confrag.llm.llm_name, 'blue')} \nuse web for rag: {str_in_color(confrag.retriever.use_web, 'blue')}"
        )

    def _set_k(self, arg: str) -> None:
        """Set the number of retrieved documents from the text *arg*."""
        # Keep the try minimal so only a bad number is reported as bad input.
        try:
            k = int(arg)
        except ValueError:
            print("Invalid input. Please enter a valid integer.")
            return
        if k <= 0:
            print("Please enter a positive integer.")
            return
        self.init_config()
        self.ragConfig.rag.retriever.k = k
        self.modified = True
        print(f"k set to {k}")

    def _set_model(self, new_model: str) -> None:
        """Switch the generation model if *new_model* passes validation."""
        valid, message = is_valid_model_path(new_model)
        if valid:
            self.init_config()
            self.ragConfig.rag.llm.llm_name = new_model
            self.modified = True
        print(message)

    def _set_webrag(self, arg: str) -> None:
        """Enable or disable web search from the text *arg* (true/false)."""
        choice = arg.lower()
        if choice not in ("true", "false"):
            print(
                f"Invalid input. Enter {str_in_color('setWebrag True', 'green')} or {str_in_color('setWebrag False', 'red')}."
            )
            return
        self.init_config()
        retriever_config = self.ragConfig.rag.retriever
        use_web = choice == "true"
        if retriever_config.use_web != use_web:
            retriever_config.use_web = use_web
            # Only ever raise the flag: an unchanged value must not cancel a
            # rebuild that an earlier setK/setModel is still waiting for.
            self.modified = True

    def cli_ception(self):
        """Run the nested question prompt until the user types ``/bye``.

        EOF and Ctrl-C are not handled here; they propagate to the caller.
        """
        prompt = str_in_color("rag (type /bye to exit) > ", "red", bold=True)
        while True:
            query = input(prompt)
            if query.strip() == "/bye":
                print_in_color("Exiting the RAG CLI", "red", True)
                break
            if not query.strip():
                # Nothing to ask; do not run the pipeline on an empty query.
                continue
            self.init_config()
            if self.ragPP is None or self.modified:
                self.initialize_ragpp()
                self.modified = False
            self.do_rag(query)

    def init_config(self):
        """Load the configuration file once; later calls are no-ops."""
        if self.ragConfig is None:
            self.ragConfig = load_config(self.config_file, RAGInferenceConfig)

    def initialize_ragpp(self):
        """Build the RAG pipeline from the current in-memory configuration."""
        logger.info("Creating the RAG Pipeline...")
        self.ragPP = RAGPipeline.from_config(self.ragConfig.rag)
        logger.info("RAG pipeline initialized!")

    def do_rag(self, query):
        """Answer *query* with the pipeline and print the answer.

        When web search is enabled, the title and url of every returned
        document that carries a url are printed as sources.

        Args:
            query: The user's question.
        """
        queries = [{"input": query, "collection_name": "my_docs"}]
        results = self.ragPP(queries, return_dict=True)

        print(query)
        print(results[0]["answer"].split("<|end_header_id|>")[-1])
        if self.ragConfig.rag.retriever.use_web:
            # Do not assume exactly k web documents: the web search may return
            # fewer hits or none, and other documents carry no url.
            sources = [
                doc["metadata"]
                for doc in results[0]["docs"]
                if "url" in doc["metadata"]
            ]
            if sources:
                print("\nSources: \n")
                for metadata in sources:
                    print(f"{metadata.get('title', '')} : {metadata['url']}")


def is_valid_model_path(model_path: str) -> tuple[bool, str]:
    """Check on the Hugging Face Hub that *model_path* names a model.

    Args:
        model_path: Hub identifier of the model, as typed by the user.

    Returns:
        ``(True, confirmation message)`` when the Hub lookup succeeds, else
        ``(False, error message)`` including the underlying error.
    """
    try:
        model_info(model_path)
    except (HfHubHTTPError, ValueError) as e:
        # ValueError covers a malformed repo id (huggingface_hub raises
        # HFValidationError, a ValueError subclass), which would otherwise
        # escape and terminate the interactive session.
        return (
            False,
            f"{str_in_color('There seems to be an error. Are you sure the model you are asking for exists?', 'red', True)} The error message: {e}",
        )
    return True, f"New model set to {str_in_color(model_path, 'blue', True)}"


def str_in_color(to_print: str, color: str, bold: bool = False) -> str:
    """Return *to_print* wrapped in ANSI escape codes for terminal styling.

    Args:
        to_print: Value to style; it is formatted into the result string.
        color: Name of a known style ("red", "green", "yellow", "blue",
            "bold" or "reset"). Any other name falls back to the reset code.
        bold: When true, the bold code is placed before the color code.

    Returns:
        The styled text, always terminated by the reset code.
    """
    reset = "\033[0m"
    bold_code = "\033[1m"
    palette = {
        "reset": reset,
        "bold": bold_code,
        "red": "\033[31m",
        "green": "\033[32m",
        "yellow": "\033[33m",
        "blue": "\033[34m",
    }
    prefix = palette[color] if color in palette else reset
    if bold:
        prefix = bold_code + prefix
    return f"{prefix}{to_print}{reset}"


def print_in_color(to_print: str, color: str, bold: bool = False) -> None:
    """Print *to_print* styled by ``str_in_color`` with the same arguments."""
    styled = str_in_color(to_print, color, bold)
    print(styled)


def str_green(text, bold=False):
    """Return *text* styled in green via ``str_in_color``; *bold* is forwarded."""
    return str_in_color(to_print=text, color="green", bold=bold)


if __name__ == "__main__":
    # Example usage:
    #   python -m mmore.run_ragcli --config-file examples/rag/config.yaml

    cli_parser = argparse.ArgumentParser()
    cli_parser.add_argument(
        "--config-file", required=True, help="Path to the RAG configuration file."
    )
    parsed = cli_parser.parse_args()

    # Start the interactive session with the given configuration file.
    RagCLI(parsed.config_file).launch_cli()
Loading