feat: correct RAG CLI empty answers and improve UX

JCHAVEROT · JCHAVEROT · commit 56bedef5adde · 2026-06-12T20:31:44.000+02:00
diff --git a/src/mmore/cli.py b/src/mmore/cli.py
@@ -1,3 +1,8 @@
+import os
+
+# To have the fork-handler messages ("FD from fork parent still in poll list")
+os.environ.setdefault("GRPC_VERBOSITY", "ERROR")
+
 from typing import Optional
 
 import click
diff --git a/src/mmore/rag/pipeline.py b/src/mmore/rag/pipeline.py
@@ -11,7 +11,12 @@
 from langchain_core.language_models.chat_models import BaseChatModel
 from langchain_core.output_parsers import StrOutputParser
 from langchain_core.prompts import ChatPromptTemplate
-from langchain_core.runnables import Runnable, RunnableLambda, RunnablePassthrough
+from langchain_core.runnables import (
+    Runnable,
+    RunnableConfig,
+    RunnableLambda,
+    RunnablePassthrough,
+)
 
 from ..utils import load_config
 from .judge import JUDGE_OUTPUT_KEYS, JudgeConfig, LLMJudge, retrieve_with_judge
@@ -130,14 +135,17 @@ def retrieval_with_judge(state: Dict[str, Any]) -> Dict[str, Any]:
         return validate_input | core_chain | validate_output
 
     def __call__(
-        self, queries: Dict[str, Any] | List[Dict[str, Any]], return_dict: bool = False
-    ) -> List[Dict[str, str | List[str]]]:
+        self,
+        queries: Dict[str, Any] | List[Dict[str, Any]],
+        return_dict: bool = False,
+        config: Optional[RunnableConfig] = None,
+    ) -> List[Dict[str, Any]]:
         if isinstance(queries, Dict):
             queries_list = [queries]
         else:
             queries_list = queries
 
-        results = self.rag_chain.batch(queries_list)
+        results = self.rag_chain.batch(queries_list, config=config)
 
         if return_dict:
             return results
diff --git a/src/mmore/run_ragcli.py b/src/mmore/run_ragcli.py
@@ -1,9 +1,17 @@
 import argparse
+import itertools
 import logging
-from typing import Optional
+import random
+import sys
+import threading
+import time
+import warnings
+from typing import Any, Dict, List, Optional
 
 from huggingface_hub import model_info
 from huggingface_hub.errors import HfHubHTTPError
+from langchain_core.callbacks import BaseCallbackHandler
+from pymilvus.exceptions import MilvusException
 
 RAG_EMOJI = "🧠🧠🧠🧠🧠"
 logger = logging.getLogger(__name__)
@@ -29,27 +37,22 @@ def __init__(self, config_file: str):
         self.config_file = config_file
 
     def launch_cli(self):
+        quiet_noisy_libs()
         print_in_color(
             "Welcome to this RAG command-line interface! 🧠", "green", bold=True
         )
         print(
-            "Available commands are: config, rag, setK, setModel, setWebrag, exit, help. To learn more about usage of a specific command, use the following: \n help <command>"
+            f"\nPress {str_green('Enter', bold=True)} to start asking questions about your documents.\n"
         )
         print(
-            f"Available commands:\n\
+            f"Other commands:\n\
         {str_green('config')} : see the current config \n\
-        {str_green('rag')} : enter the RAG CLI \n\
         {str_green('setK')} : set the number of documents to retrieve \n\
         {str_green('setModel')} : set the model for generation \n\
         {str_green('setWebrag')} : decide whether to use web rag \n\
-        {str_green('help')} : learn more about a command \n\
+        {str_green('help')} : learn more about a command (help <command>) \n\
         {str_green('exit')} : exit the CLI"
         )
-        print_in_color(
-            "To learn more about usage of a specific command, use the following: \n help <command>",
-            "blue",
-            bold=True,
-        )
         while True:
             try:
                 cmd = input("> ").strip()
@@ -58,7 +61,7 @@ def launch_cli(self):
                     break
                 elif cmd == "help":
                     print(
-                        "Available commands are: config, rag, setK, setModel, webrag, exit, help. To learn more about usage of a specific command, use the following: \n help <command>"
+                        f"Press {str_green('Enter')} (or type rag) to start asking questions about your documents.\nOther commands are: config, setK, setModel, setWebrag, exit, help. To learn more about usage of a specific command, use the following: \n help <command>"
                     )
                 elif cmd.startswith("help "):
                     command = cmd.split(" ", 1)[1]
@@ -69,7 +72,9 @@ def launch_cli(self):
                     elif command == "config":
                         print("Print the current configuration.")
                     elif command == "rag":
-                        print("Enter the RAG CLI. Type /bye to exit.")
+                        print(
+                            "Start a chat session to ask questions about your documents. Type /bye to exit."
+                        )
                     elif command == "setK":
                         print(
                             "Use the command in the following way: 'setK <k>', for a positive integer k. This will set the number of documents to retrieve during RAG."
@@ -147,27 +152,42 @@ def launch_cli(self):
                             f"Invalid output. Enter {str_in_color('setWebrag True', 'green')} or {str_in_color('setWebrag False', 'red')}."
                         )
 
-                elif cmd == "rag":
+                elif cmd in ("", "rag"):
                     self.cli_ception()
 
                 else:
                     print(f"Unknown command: {cmd}")
+                    if " " in cmd or cmd.endswith("?"):
+                        print(
+                            f"Looks like a question! Press {str_green('Enter')} first to start asking questions about your documents."
+                        )
             except (EOFError, KeyboardInterrupt):
                 print("\nExiting...")
                 break
 
     def cli_ception(self):
+        self.init_config()
+        if self.ragPP is None or self.modified:
+            try:
+                with Spinner():
+                    self.initialize_ragpp()
+            except MilvusException as e:
+                print_in_color(f"Failed to open the document database: {e}", "red")
+                print(
+                    f"A previous session may still be holding it. Run {str_green('pkill -f milvus_lite/lib/milvus')} and try again."
+                )
+                return
+            self.modified = False
+            print_in_color("RAG pipeline ready! Ask your questions.", "green")
         while True:
             query = input(str_in_color("rag (type /bye to exit) > ", "red", bold=True))
             if query == "/bye":
                 print_in_color("Exiting the RAG CLI", "red", True)
                 break
             else:
-                self.init_config()
-                if self.ragPP is None or self.modified:
-                    self.initialize_ragpp()
-                    self.modified = False
-                self.do_rag(query)
+                with Spinner():
+                    results, timings = self.do_rag(query)
+                self.print_answer(results, timings)
 
     def init_config(self):
         if self.ragConfig is None:
@@ -181,22 +201,181 @@ def initialize_ragpp(self):
         logger.info("RAG pipeline initialized!")
 
     @profile_function()
-    def do_rag(self, query):
+    def do_rag(self, query) -> tuple[List[Dict[str, Any]], "TimingHandler"]:
         queries = [{"input": query, "collection_name": "my_docs"}]
         # called only after init_config and initialize_ragpp
         assert self.ragConfig is not None
         assert self.ragPP is not None
 
-        results = self.ragPP(queries, return_dict=True)
+        timings = TimingHandler()
+        results = self.ragPP(queries, return_dict=True, config={"callbacks": [timings]})
+        return results, timings
+
+    def print_answer(
+        self, results: List[Dict[str, Any]], timings: Optional["TimingHandler"] = None
+    ) -> None:
+        assert self.ragConfig is not None
 
-        print(query)
-        print(results[0]["answer"][-1].split("<|end_header_id|>")[-1])
+        answer = results[0]["answer"].split("<|end_header_id|>")[-1].strip()
+        print(f"\n{answer}\n")
+        if timings is not None:
+            self._print_metrics(results[0], answer, timings)
         if self.ragConfig.rag.retriever.use_web:
-            print("\nSources: \n")
+            print("Sources:")
             for i in range(self.ragConfig.rag.retriever.k):
                 url = results[0]["docs"][i]["metadata"]["url"]  # pyright: ignore
                 title = results[0]["docs"][i]["metadata"]["title"]  # pyright: ignore
-                print(f"{title} : {url}")
+                print(f"  - {title}: {url}")
+            print()
+
+    def _print_metrics(
+        self, result: Dict[str, Any], answer: str, timings: "TimingHandler"
+    ) -> None:
+        assert self.ragConfig is not None
+        llm = self.ragConfig.rag.llm
+
+        line1 = [f"{llm.llm_name} ({'local' if llm.provider == 'HF' else 'API'})"]
+        if timings.retrieval_time is not None:
+            line1.append(f"retrieval {timings.retrieval_time:.2f}s")
+        if timings.generation_time is not None:
+            line1.append(f"generation {timings.generation_time:.2f}s")
+
+        docs = result.get("docs") or []
+        line2 = [f"{len(docs)} chunks"]
+
+        ctx_tokens = self._count_tokens(result.get("context"))
+        if ctx_tokens is not None:
+            ctx = f"{ctx_tokens / 1000:.1f}k" if ctx_tokens >= 1000 else str(ctx_tokens)
+            line2.append(f"{ctx} context tokens")
+
+        gen_tokens = timings.completion_tokens or self._count_tokens(answer)
+        if gen_tokens:
+            part = f"{gen_tokens} tokens"
+            if timings.generation_time:
+                part += f" @ {gen_tokens / timings.generation_time:.0f} tok/s"
+            line2.append(part)
+
+        scores = [
+            d["metadata"]["similarity"]
+            for d in docs
+            if d.get("metadata", {}).get("similarity") is not None
+        ]
+        if scores:
+            line2.append(f"top score {max(scores):.2f}")
+
+        print(str_in_color(" | ".join(line1), "gray"))
+        print(str_in_color(" | ".join(line2), "gray") + "\n")
+
+    def _count_tokens(self, text: Optional[str]) -> Optional[int]:
+        """Token count using the local model tokenizer, if one is available."""
+        if not text or self.ragPP is None:
+            return None
+        tokenizer = getattr(self.ragPP.llm, "tokenizer", None)
+        if tokenizer is None:
+            return None
+        try:
+            return len(tokenizer.encode(text))
+        except Exception:
+            return None
+
+
+class TimingHandler(BaseCallbackHandler):
+    """Collects retrieval/generation wall times and token usage from callbacks."""
+
+    def __init__(self):
+        self.retrieval_time: Optional[float] = None
+        self.generation_time: Optional[float] = None
+        self.completion_tokens: Optional[int] = None
+        self._starts: Dict[Any, float] = {}
+
+    def on_retriever_start(self, serialized, query, *, run_id, **kwargs):
+        self._starts[run_id] = time.perf_counter()
+
+    def on_retriever_end(self, documents, *, run_id, **kwargs):
+        if run_id in self._starts:
+            self.retrieval_time = time.perf_counter() - self._starts.pop(run_id)
+
+    def on_llm_start(self, serialized, prompts, *, run_id, **kwargs):
+        self._starts[run_id] = time.perf_counter()
+
+    def on_chat_model_start(self, serialized, messages, *, run_id, **kwargs):
+        self._starts[run_id] = time.perf_counter()
+
+    def on_llm_end(self, response, *, run_id, **kwargs):
+        if run_id in self._starts:
+            self.generation_time = time.perf_counter() - self._starts.pop(run_id)
+        self.completion_tokens = _output_tokens(response)
+
+
+def _output_tokens(response) -> Optional[int]:
+    """Generated-token count if the provider reported it (API models do; HF rarely)."""
+    try:
+        usage = response.generations[0][0].message.usage_metadata
+        if usage and usage.get("output_tokens"):
+            return usage["output_tokens"]
+    except (AttributeError, IndexError, TypeError):
+        pass
+    usage = (response.llm_output or {}).get("token_usage", {})
+    return usage.get("completion_tokens") or usage.get("output_tokens")
+
+
+SPINNER_WORDS = [
+    "Thinking",
+    "Pondering",
+    "Discombobulating",
+    "Cooking",
+    "Brewing",
+    "Ruminating",
+    "Rummaging",
+    "Noodling",
+]
+
+
+class Spinner:
+    """Animated status line shown while work happens in the calling thread."""
+
+    def __init__(self):
+        self._stop = threading.Event()
+        self._thread: Optional[threading.Thread] = None
+
+    def __enter__(self):
+        if sys.stdout.isatty():
+            self._thread = threading.Thread(target=self._spin, daemon=True)
+            self._thread.start()
+        return self
+
+    def __exit__(self, *exc):
+        self._stop.set()
+        if self._thread is not None:
+            self._thread.join()
+            sys.stdout.write("\r\033[K")
+            sys.stdout.flush()
+
+    def _spin(self):
+        frames = itertools.cycle("|/-\\")
+        word = random.choice(SPINNER_WORDS)
+        start = word_start = time.monotonic()
+        while not self._stop.is_set():
+            now = time.monotonic()
+            if now - word_start > 3:
+                word = random.choice(SPINNER_WORDS)
+                word_start = now
+            status = f"{next(frames)} {word}... ({int(now - start)}s)"
+            sys.stdout.write(f"\r\033[K{str_in_color(status, 'blue')}")
+            sys.stdout.flush()
+            time.sleep(0.1)
+
+
+def quiet_noisy_libs():
+    """Hide INFO logs, warnings and progress bars so the CLI stays clean."""
+    logging.disable(logging.INFO)
+    warnings.filterwarnings("ignore")
+    try:
+        from transformers.utils import logging as hf_logging
+    except ImportError:
+        return
+    hf_logging.set_verbosity_error()
+    hf_logging.disable_progress_bar()
 
 
 def is_valid_model_path(model_path: str):
@@ -218,6 +397,7 @@ def str_in_color(to_print: str | int, color: str, bold: bool = False) -> str:
         "green": "\033[32m",
         "yellow": "\033[33m",
         "blue": "\033[34m",
+        "gray": "\033[90m",
     }
     style = colors.get(color, colors["reset"])
     if bold:
@@ -234,6 +414,7 @@ def str_green(text, bold=False):
 
 
 if __name__ == "__main__":
+    quiet_noisy_libs()
     enable_profiling_from_env()
     # example usage: python -m mmore.ragcli --config-file examples/rag/config.yaml
 
diff --git a/src/mmore/tui/app.py b/src/mmore/tui/app.py
@@ -145,7 +145,7 @@ def _run_single_command() -> None:
     if spec.needs_input_data:
         input_data = questionary.text(
             "Input JSONL path",
-            default=cwd_default("outputs/process/merged/merged_results.jsonl"),
+            default=cwd_default("examples/process/outputs/merged/merged_results.jsonl"),
             style=QSTYLE,
             qmark=QMARK,
         ).ask()
diff --git a/tests/test_ragcli.py b/tests/test_ragcli.py