From b5181a5263e85bdb60f038e3af164430ae7c889b Mon Sep 17 00:00:00 2001 From: Arthur PERRIN Date: Fri, 8 May 2026 17:50:41 +0200 Subject: [PATCH 01/24] feat(tui): add interactive Terminal UI for mmore commands MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a `mmore tui` command that launches an interactive terminal interface built with `questionary` and `rich`. Lets users pick a pipeline command (process / postprocess / index / rag), reuse or build a config interactively, and run the full pipeline with progress feedback — without having to write YAML configs by hand. - New `src/mmore/tui/` module (app, commands registry, config builder, pipeline runner, theme). - Wires up `tui` as a top-level Click command in `cli.py`. - Adds `questionary>=2.0` and `rich>=13` to core dependencies. --- pyproject.toml | 4 +- src/mmore/cli.py | 8 + src/mmore/tui/__init__.py | 3 + src/mmore/tui/app.py | 121 ++++++++++++ src/mmore/tui/commands.py | 158 ++++++++++++++++ src/mmore/tui/config_builder.py | 314 ++++++++++++++++++++++++++++++++ src/mmore/tui/pipeline.py | 103 +++++++++++ src/mmore/tui/theme.py | 68 +++++++ uv.lock | 28 +++ 9 files changed, 806 insertions(+), 1 deletion(-) create mode 100644 src/mmore/tui/__init__.py create mode 100644 src/mmore/tui/app.py create mode 100644 src/mmore/tui/commands.py create mode 100644 src/mmore/tui/config_builder.py create mode 100644 src/mmore/tui/pipeline.py create mode 100644 src/mmore/tui/theme.py diff --git a/pyproject.toml b/pyproject.toml index a6f63a51..b9428fa9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,7 +40,9 @@ dependencies = [ "python-dotenv>=1.0", "typing_extensions>=4.15.0,<5.0", "PyYAML>=6.0", - "setuptools<81" + "setuptools<81", + "questionary>=2.0", + "rich>=13" ] [project.optional-dependencies] diff --git a/src/mmore/cli.py b/src/mmore/cli.py index ad952f58..7e8e2af2 100644 --- a/src/mmore/cli.py +++ b/src/mmore/cli.py @@ -265,6 +265,14 @@ def ragcli(config_file: 
str): my_rag_cli.launch_cli() +@main.command() +def tui(): + """Launch the interactive Terminal UI.""" + from .tui import run + + run() + + @main.group() def colpali(): """ColPali pipeline commands for PDF processing, indexing, and retrieval.""" diff --git a/src/mmore/tui/__init__.py b/src/mmore/tui/__init__.py new file mode 100644 index 00000000..3004c7fb --- /dev/null +++ b/src/mmore/tui/__init__.py @@ -0,0 +1,3 @@ +from mmore.tui.app import run + +__all__ = ["run"] diff --git a/src/mmore/tui/app.py b/src/mmore/tui/app.py new file mode 100644 index 00000000..9feddf10 --- /dev/null +++ b/src/mmore/tui/app.py @@ -0,0 +1,121 @@ +"""mmore TUI entry point.""" +from __future__ import annotations + +import time + +import questionary +from questionary import Style +from rich.spinner import Spinner +from rich.live import Live +from rich.text import Text + +from mmore.tui.commands import REGISTRY +from mmore.tui.config_builder import pick_or_build_config +from mmore.tui.pipeline import run_full_pipeline +from mmore.tui.theme import ACCENT, ACCENT2, MUTED, OK, console, section, show_banner + +QSTYLE = Style([ + ("qmark", "fg:#5fd7ff bold"), + ("question", "bold"), + ("answer", "fg:#ff5fd7 bold"), + ("pointer", "fg:#5fd7ff bold"), + ("highlighted", "fg:#5fd7ff bold"), + ("selected", "fg:#ff5fd7"), + ("instruction", "fg:#808080 italic"), +]) + + +def _run_with_spinner(label: str, fn, **kwargs) -> None: + start = time.time() + spinner = Spinner("dots", text=Text(f" {label}…", style=ACCENT)) + with Live(spinner, console=console, refresh_per_second=12, transient=True): + fn(**kwargs) + console.print( + f" [{OK}]✓[/] {label} [dim]({time.time() - start:.1f}s)[/dim]" + ) + + +def _run_single_command() -> None: + choices = [ + questionary.Choice(f"{spec.name:<12} — {spec.description}", value=spec.name) + for spec in REGISTRY.values() + ] + name = questionary.select( + "Pick a command", choices=choices, style=QSTYLE, qmark="▸", + ).ask() + if name is None: + return + spec = 
REGISTRY[name] + config_file = pick_or_build_config(spec) + kwargs = {"config_file": config_file} + if spec.needs_input_data: + input_data = questionary.text( + "Input JSONL path", + default="examples/process/outputs/merged/merged_results.jsonl", + style=QSTYLE, qmark="▸", + ).ask() + if input_data is None: + return + kwargs["input_data"] = input_data + + console.print() + console.print(section( + f"Running {name}", + Text(f"config: {config_file}", style=MUTED), + style=ACCENT2, + )) + interactive = name in {"ragcli", "retrieve", "rag"} + if interactive: + spec.run(**kwargs) + else: + _run_with_spinner(spec.description, spec.run, **kwargs) + console.print(f"[{OK}]✓ {name} finished[/]") + + +def _chat_only() -> None: + config_file = pick_or_build_config(REGISTRY["ragcli"]) + console.print() + console.print(section("RAG chat", Text(f"config: {config_file}", style=MUTED))) + REGISTRY["ragcli"].run(config_file=config_file) + + +def _main_menu() -> str | None: + return questionary.select( + "What do you want to do?", + choices=[ + questionary.Choice("⚙ Run a single command", value="single"), + questionary.Choice( + "🚀 Run full pipeline (process → postprocess → index)", + value="pipeline", + ), + questionary.Choice("💬 Chat with indexed documents", value="chat"), + questionary.Separator(), + questionary.Choice("✕ Quit", value="quit"), + ], + style=QSTYLE, + qmark="▸", + ).ask() + + +def run() -> None: + console.clear() + show_banner("interactive launcher") + while True: + try: + mode = _main_menu() + if mode in (None, "quit"): + console.print(f"[{ACCENT}]bye![/]") + return + if mode == "single": + _run_single_command() + elif mode == "pipeline": + run_full_pipeline() + elif mode == "chat": + _chat_only() + except KeyboardInterrupt: + console.print(f"\n[{ACCENT2}]interrupted.[/]") + return + except Exception as e: # noqa: BLE001 + console.print(f"[bold red]error:[/] {e}") + if not questionary.confirm("Continue?", default=True, style=QSTYLE).ask(): + return diff --git 
a/src/mmore/tui/commands.py b/src/mmore/tui/commands.py new file mode 100644 index 00000000..fae5e67f --- /dev/null +++ b/src/mmore/tui/commands.py @@ -0,0 +1,158 @@ +"""Registry of mmore commands callable from the TUI. + +Each entry mirrors a Click command in `mmore.cli` so the TUI is a thin wrapper: +the `run` callable is the same `run_*` function the CLI uses. +""" +from dataclasses import dataclass, field +from typing import Any, Callable, Optional + + +@dataclass +class CommandSpec: + name: str + description: str + example_config: Optional[str] + run: Callable[..., None] + needs_input_data: bool = False + config_globs: list[str] = field(default_factory=list) + # Lazy importer returning the dataclass to validate YAML against. + # Returns None if no validation is wired up for this stage. + config_dataclass: Optional[Callable[[], Any]] = None + + +def _process(config_file: str, **_): + from mmore.run_process import process + process(config_file) + + +def _postprocess(config_file: str, input_data: str, **_): + from mmore.run_postprocess import postprocess + postprocess(config_file, input_data) + + +def _index(config_file: str, documents_path: Optional[str] = None, + collection_name: Optional[str] = None, **_): + from mmore.run_index import index + index(config_file, documents_path, collection_name) + + +def _retrieve(config_file: str, **_): + from mmore.run_retriever import run_api + run_api(config_file, "0.0.0.0", 8001) + + +def _rag(config_file: str, **_): + from mmore.run_rag import rag + rag(config_file) + + +def _ragcli(config_file: str, **_): + from mmore.run_ragcli import RagCLI + RagCLI(config_file).launch_cli() + + +def _websearch(config_file: str, **_): + from mmore.run_websearch import run_websearch + run_websearch(config_file) + + +# Lazy dataclass importers — keeps heavy deps out of TUI startup. 
+def _dc_process(): + from mmore.run_process import ProcessInference + return ProcessInference + + +def _dc_postprocess(): + from mmore.process.post_processor.pipeline import PPPipelineConfig + return PPPipelineConfig + + +def _dc_index(): + from mmore.run_index import IndexConfig + return IndexConfig + + +def _dc_rag(): + from mmore.run_rag import RAGInferenceConfig + return RAGInferenceConfig + + +REGISTRY: dict[str, CommandSpec] = { + "process": CommandSpec( + name="process", + description="Crawl + extract documents into a JSONL", + example_config="examples/process/config.yaml", + run=_process, + config_globs=[ + "examples/process/**/*.yaml", + "examples/process/**/*.yml", + ], + config_dataclass=_dc_process, + ), + "postprocess": CommandSpec( + name="postprocess", + description="Chunk / clean processed documents", + example_config="examples/postprocessor/config.yaml", + run=_postprocess, + needs_input_data=True, + config_globs=[ + "examples/postprocessor/**/*.yaml", + "examples/postprocessor/**/*.yml", + ], + config_dataclass=_dc_postprocess, + ), + "index": CommandSpec( + name="index", + description="Embed + store documents in Milvus", + example_config="examples/index/config.yaml", + run=_index, + config_globs=[ + "examples/index/**/*.yaml", + "examples/index/**/*.yml", + ], + config_dataclass=_dc_index, + ), + "retrieve": CommandSpec( + name="retrieve", + description="Run retriever API server", + example_config="examples/rag/config.yaml", + run=_retrieve, + config_globs=[ + "examples/rag/**/*.yaml", + "examples/rag/**/*.yml", + ], + config_dataclass=_dc_rag, + ), + "rag": CommandSpec( + name="rag", + description="Run a one-shot RAG pipeline", + example_config="examples/rag/config.yaml", + run=_rag, + config_globs=[ + "examples/rag/**/*.yaml", + "examples/rag/**/*.yml", + ], + config_dataclass=_dc_rag, + ), + "ragcli": CommandSpec( + name="ragcli", + description="Interactive RAG chat", + example_config="examples/rag/config.yaml", + run=_ragcli, + 
config_globs=[ + "examples/rag/**/*.yaml", + "examples/rag/**/*.yml", + ], + config_dataclass=_dc_rag, + ), + "websearch": CommandSpec( + name="websearch", + description="Web search (+ optional RAG)", + example_config="examples/websearchRAG/config.yaml", + run=_websearch, + config_globs=[ + "examples/websearchRAG/**/*.yaml", + "examples/websearchRAG/**/*.yml", + ], + ), +} diff --git a/src/mmore/tui/config_builder.py b/src/mmore/tui/config_builder.py new file mode 100644 index 00000000..f91cc2fe --- /dev/null +++ b/src/mmore/tui/config_builder.py @@ -0,0 +1,314 @@ +"""Generate YAML config files via guided prompts. + +Templates here mirror the example configs under `examples/`. The user is +asked only for the fields most likely to change between runs; everything else +falls back to the example defaults. The resulting dict is dumped to a YAML +file under `./tui-configs/`. +""" +from __future__ import annotations + +import os +import time +from pathlib import Path +from typing import Any, Optional + +import questionary +import yaml +from questionary import Style +from rich.panel import Panel +from rich.text import Text + +from mmore.tui.commands import CommandSpec + +CONFIG_DIR = Path("./tui-configs") + +QSTYLE = Style([ + ("qmark", "fg:#5fd7ff bold"), + ("question", "bold"), + ("answer", "fg:#ff5fd7 bold"), + ("pointer", "fg:#5fd7ff bold"), + ("highlighted", "fg:#5fd7ff bold"), + ("selected", "fg:#ff5fd7"), + ("instruction", "fg:#808080 italic"), +]) +QMARK = "▸" + + +def _prompt(question: str, default: str = "") -> str: + answer = questionary.text(question, default=default, style=QSTYLE, qmark=QMARK).ask() + if answer is None: + raise KeyboardInterrupt + return answer + + +def _confirm(question: str, default: bool = False) -> bool: + answer = questionary.confirm(question, default=default, style=QSTYLE, qmark=QMARK).ask() + if answer is None: + raise KeyboardInterrupt + return answer + + +def _save(name: str, data: dict[str, Any]) -> str: + 
CONFIG_DIR.mkdir(parents=True, exist_ok=True) + path = CONFIG_DIR / f"{name}-{int(time.time())}.yaml" + with open(path, "w") as f: + yaml.safe_dump(data, f, sort_keys=False) + return str(path) + + +def build_process_config() -> str: + data_path = _prompt("Data path (folder with documents to process)", "examples/sample_data/") + output_path = _prompt("Output path (where merged_results.jsonl will be written)", + "examples/process/outputs/") + use_fast = _confirm("Use fast (lower-quality) processors?", default=False) + distributed = _confirm("Use distributed processing (Dask)?", default=False) + extract_images = _confirm("Extract images from documents?", default=True) + + cfg = { + "data_path": data_path, + "google_drive_ids": [], + "previous_results": None, + "dispatcher_config": { + "output_path": output_path, + "use_fast_processors": use_fast, + "distributed": distributed, + "extract_images": extract_images, + "scheduler_file": None, + "process_batch_sizes": [ + {"URLProcessor": 40}, + {"DOCXProcessor": 100}, + {"PDFProcessor": 4000}, + {"MediaProcessor": 40}, + {"SpreadsheetProcessor": 100}, + {"TXTProcessor": 100}, + {"PPTXProcessor": 100}, + {"MarkdownProcessor": 100}, + {"EMLProcessor": 100}, + {"HTMLProcessor": 100}, + ], + "processor_config": { + "MediaProcessor": [ + {"normal_model": "openai/whisper-large-v3-turbo"}, + {"fast_model": "openai/whisper-tiny"}, + {"type": "automatic-speech-recognition"}, + {"sample_rate": 10}, + {"batch_size": 4}, + ], + "PDFProcessor": [ + {"PDFTEXT_CPU_WORKERS": 0}, + {"DETECTOR_BATCH_SIZE": 1}, + {"DETECTOR_POSTPROCESSING_CPU_WORKERS": 0}, + {"RECOGNITION_BATCH_SIZE": 1}, + {"OCR_PARALLEL_WORKERS": 0}, + {"TEXIFY_BATCH_SIZE": 1}, + {"LAYOUT_BATCH_SIZE": 1}, + {"ORDER_BATCH_SIZE": 1}, + {"TABLE_REC_BATCH_SIZE": 1}, + ], + }, + }, + } + return _save("process", cfg) + + +def build_postprocess_config() -> str: + strategy = questionary.select( + "Chunking strategy", + choices=["sentence", "token", "word", "semantic"], + 
default="sentence", + style=QSTYLE, qmark=QMARK, + ).ask() + if strategy is None: + raise KeyboardInterrupt + table_handling = questionary.select( + "Table handling", + choices=["single_row", "multi_rows", "keep_whole", "none"], + default="single_row", + style=QSTYLE, qmark=QMARK, + ).ask() + if table_handling is None: + raise KeyboardInterrupt + output_path = _prompt("Output JSONL path", + "examples/postprocessor/outputs/merged/results.jsonl") + + cfg = { + "previous_results": None, + "pp_modules": [ + {"type": "chunker", "args": { + "chunking_strategy": strategy, + "table_handling": table_handling, + }}, + ], + "output": {"output_path": output_path, "save_each_step": True}, + } + return _save("postprocess", cfg) + + +def build_index_config(documents_path: Optional[str] = None) -> str: + dense = _prompt("Dense embedding model", + "sentence-transformers/all-MiniLM-L6-v2") + sparse = _prompt("Sparse embedding model", "splade") + db_uri = _prompt("DB URI (Milvus Lite file or server URL)", "./proc_demo.db") + db_name = _prompt("DB name", "my_db") + collection = _prompt("Collection name", "my_docs") + docs = documents_path or _prompt( + "Documents JSONL path", + "examples/postprocessor/outputs/merged/results.jsonl", + ) + cfg = { + "indexer": { + "dense_model": {"model_name": dense, "is_multimodal": False}, + "sparse_model": {"model_name": sparse, "is_multimodal": False}, + "db": {"uri": db_uri, "name": db_name}, + }, + "collection_name": collection, + "documents_path": docs, + } + return _save("index", cfg) + + +BUILDERS = { + "process": build_process_config, + "postprocess": build_postprocess_config, + "index": build_index_config, +} + + +def find_yaml_configs(spec: CommandSpec, root: str = ".") -> list[str]: + """Find candidate YAML configs scoped to this stage. 
+ + Includes: + - files matching any of `spec.config_globs` + - previously-generated `tui-configs/-*.yaml` + """ + root_path = Path(root) + matches: list[str] = [] + for pattern in spec.config_globs: + for p in root_path.glob(pattern): + matches.append(str(p)) + # Generated configs from previous TUI runs + generated = root_path / "tui-configs" + if generated.exists(): + for p in sorted(generated.glob(f"{spec.name}-*.yaml")): + matches.append(str(p)) + + seen: set[str] = set() + out: list[str] = [] + for m in matches: + if m not in seen: + seen.add(m) + out.append(m) + return out + + +def _validate_yaml(path: str, spec: CommandSpec) -> Optional[str]: + """Return None on success, an error message string on failure.""" + if spec.config_dataclass is None: + return None + try: + from mmore.utils import load_config + dataclass_cls = spec.config_dataclass() + load_config(path, dataclass_cls) + return None + except Exception as e: # noqa: BLE001 + return f"{type(e).__name__}: {e}" + + +def _show_error_panel(path: str, err: str) -> None: + from mmore.tui.theme import console + console.print(Panel( + Text.assemble( + (f"{path}\n\n", "bold"), + (err, "red"), + ), + title="[bold red]invalid config[/]", + border_style="red", + padding=(1, 2), + )) + + +def _ranked_choices(spec: CommandSpec, candidates: list[str]) -> list[Any]: + """Put `spec.example_config` first as ★ recommended; rest under a separator.""" + choices: list[Any] = [] + rec = spec.example_config + rest = list(candidates) + if rec and rec in rest: + choices.append(questionary.Choice(f"★ {rec} (recommended)", value=rec)) + rest.remove(rec) + elif rec and Path(rec).exists(): + choices.append(questionary.Choice(f"★ {rec} (recommended)", value=rec)) + if rest: + if choices: + choices.append(questionary.Separator("── other configs ──")) + for c in rest: + choices.append(questionary.Choice(c, value=c)) + return choices + + +def pick_or_build_config(spec: CommandSpec, documents_path: Optional[str] = None) -> str: + 
"""Ask the user to either pick an existing YAML or generate one. + + Validates the chosen YAML against the stage's dataclass and re-prompts + on failure rather than letting the run blow up later. + """ + while True: + choice = questionary.select( + f"Config for `{spec.name}`?", + choices=[ + questionary.Choice("📂 Pick existing YAML", value="pick"), + questionary.Choice("✨ Generate new YAML (guided)", value="build"), + questionary.Choice("⌨ Type a path manually", value="manual"), + ], + style=QSTYLE, qmark=QMARK, + ).ask() + if choice is None: + raise KeyboardInterrupt + + path: Optional[str] = None + + if choice == "pick": + candidates = find_yaml_configs(spec) + ranked = _ranked_choices(spec, candidates) + if not ranked: + questionary.print( + f"No YAML configs found for `{spec.name}`, falling back to manual entry.", + style="fg:yellow", + ) + choice = "manual" + else: + picked = questionary.select( + f"Select a config for `{spec.name}`", + choices=ranked, + style=QSTYLE, qmark=QMARK, + ).ask() + if picked is None: + raise KeyboardInterrupt + path = picked + + if choice == "manual": + manual = _prompt("Path to YAML config") + if not os.path.exists(manual): + _show_error_panel(manual, "file not found") + continue + path = manual + + if choice == "build": + builder = BUILDERS.get(spec.name) + if builder is None: + questionary.print( + f"No guided builder for `{spec.name}` — pick an existing YAML.", + style="fg:yellow", + ) + continue + if spec.name == "index": + path = builder(documents_path=documents_path) # type: ignore[call-arg] + else: + path = builder() + + assert path is not None + err = _validate_yaml(path, spec) + if err is None: + return path + _show_error_panel(path, err) + if not _confirm("Try a different config?", default=True): + raise KeyboardInterrupt diff --git a/src/mmore/tui/pipeline.py b/src/mmore/tui/pipeline.py new file mode 100644 index 00000000..4312c3de --- /dev/null +++ b/src/mmore/tui/pipeline.py @@ -0,0 +1,103 @@ +"""Chain process -> 
postprocess -> index from the TUI.""" +from __future__ import annotations + +import os +import time + +import questionary +import yaml +from rich.spinner import Spinner +from rich.live import Live +from rich.table import Table +from rich.text import Text + +from mmore.tui.commands import REGISTRY +from mmore.tui.config_builder import pick_or_build_config +from mmore.tui.theme import ACCENT, ACCENT2, MUTED, OK, console, section, step_header + + +def _process_output_jsonl(config_path: str) -> str: + with open(config_path) as f: + cfg = yaml.safe_load(f) + out = cfg["dispatcher_config"]["output_path"] + return os.path.join(out, "merged", "merged_results.jsonl") + + +def _postprocess_output_jsonl(config_path: str) -> str: + with open(config_path) as f: + cfg = yaml.safe_load(f) + return cfg["output"]["output_path"] + + +def _run_step(label: str, fn, **kwargs) -> float: + start = time.time() + spinner = Spinner("dots", text=Text(f" {label}…", style=ACCENT)) + with Live(spinner, console=console, refresh_per_second=12, transient=True): + fn(**kwargs) + elapsed = time.time() - start + console.print(f" [{OK}]✓[/] {label} [dim]({elapsed:.1f}s)[/dim]") + return elapsed + + +def _summary_table(rows: list[tuple[str, str, float]]) -> Table: + table = Table( + title="[bold]Pipeline summary[/bold]", + title_style=ACCENT2, + border_style=ACCENT, + header_style=f"bold {ACCENT}", + show_lines=False, + ) + table.add_column("Step", style="bold") + table.add_column("Output", style=MUTED) + table.add_column("Duration", justify="right") + total = 0.0 + for name, out, dur in rows: + table.add_row(name, out, f"{dur:.1f}s") + total += dur + table.add_section() + table.add_row("[bold]Total[/bold]", "", f"[bold]{total:.1f}s[/bold]") + return table + + +def run_full_pipeline() -> None: + console.print() + console.print(section( + "Full pipeline", + Text("process → postprocess → index → (optional) chat", style=ACCENT), + style=ACCENT2, + )) + + rows: list[tuple[str, str, float]] = [] + + # 
process + step_header(1, 3, "process") + process_cfg = pick_or_build_config(REGISTRY["process"]) + elapsed = _run_step("Crawling + extracting documents", + REGISTRY["process"].run, config_file=process_cfg) + process_jsonl = _process_output_jsonl(process_cfg) + rows.append(("process", process_jsonl, elapsed)) + + # postprocess + step_header(2, 3, "postprocess") + pp_cfg = pick_or_build_config(REGISTRY["postprocess"]) + elapsed = _run_step("Chunking + cleaning", + REGISTRY["postprocess"].run, + config_file=pp_cfg, input_data=process_jsonl) + pp_jsonl = _postprocess_output_jsonl(pp_cfg) + rows.append(("postprocess", pp_jsonl, elapsed)) + + # index + step_header(3, 3, "index") + index_cfg = pick_or_build_config(REGISTRY["index"], documents_path=pp_jsonl) + elapsed = _run_step("Embedding + indexing into Milvus", + REGISTRY["index"].run, + config_file=index_cfg, documents_path=pp_jsonl) + rows.append(("index", "(vector DB)", elapsed)) + + console.print() + console.print(_summary_table(rows)) + console.print() + + if questionary.confirm("Open the RAG chat now?", default=True).ask(): + rag_cfg = pick_or_build_config(REGISTRY["ragcli"]) + REGISTRY["ragcli"].run(config_file=rag_cfg) diff --git a/src/mmore/tui/theme.py b/src/mmore/tui/theme.py new file mode 100644 index 00000000..8d71fec6 --- /dev/null +++ b/src/mmore/tui/theme.py @@ -0,0 +1,68 @@ +"""Shared visuals: banner, palette, panel helpers.""" +from __future__ import annotations + +from rich.align import Align +from rich.console import Console, Group +from rich.panel import Panel +from rich.text import Text + +console = Console() + +# Palette +ACCENT = "bright_cyan" +ACCENT2 = "magenta" +MUTED = "grey58" +OK = "bold green" +WARN = "yellow" +ERR = "bold red" + +BANNER = r""" + + ███╗ ███╗███╗ ███╗ ██████╗ ██████╗ ███████╗ + ████╗ ████║████╗ ████║██╔═══██╗██╔══██╗██╔════╝ + ██╔████╔██║██╔████╔██║██║ ██║██████╔╝█████╗ + ██║╚██╔╝██║██║╚██╔╝██║██║ ██║██╔══██╗██╔══╝ + ██║ ╚═╝ ██║██║ ╚═╝ ██║╚██████╔╝██║ ██║███████╗ + ╚═╝ 
╚═╝╚═╝ ╚═╝ ╚═════╝ ╚═╝ ╚═╝╚══════╝ +""" + + +def _gradient(text: str, start: str = "bright_cyan", end: str = "magenta") -> Text: + """Cheap two-color gradient — top half ACCENT, bottom half ACCENT2.""" + lines = text.splitlines() + half = max(1, len(lines) // 2) + out = Text() + for i, line in enumerate(lines): + style = start if i < half else end + out.append(line + "\n", style=style) + return out + + +def show_banner(subtitle: str = "interactive launcher") -> None: + body = Group( + _gradient(BANNER), + Align.center(Text(subtitle, style=f"italic {MUTED}")), + ) + console.print(Panel( + body, + border_style=ACCENT, + padding=(0, 2), + )) + + +def section(title: str, body: str | Text, style: str = ACCENT) -> Panel: + return Panel( + body if isinstance(body, Text) else Text(body), + title=f"[bold]{title}[/bold]", + border_style=style, + padding=(1, 2), + ) + + +def step_header(idx: int, total: int, name: str) -> None: + bar = "─" * 4 + console.print() + console.print( + f"[{ACCENT}]{bar}[/] [bold]Step {idx}/{total}[/bold] " + f"[{ACCENT2}]{name}[/] [{ACCENT}]{bar}[/]" + ) diff --git a/uv.lock b/uv.lock index b0725455..85aeac3d 100644 --- a/uv.lock +++ b/uv.lock @@ -3634,6 +3634,8 @@ dependencies = [ { name = "pydantic" }, { name = "python-dotenv" }, { name = "pyyaml" }, + { name = "questionary" }, + { name = "rich" }, { name = "setuptools" }, { name = "typing-extensions" }, { name = "validators" }, @@ -3866,10 +3868,12 @@ requires-dist = [ { name = "python-dotenv", specifier = ">=1.0" }, { name = "python-pptx", marker = "extra == 'process'" }, { name = "pyyaml", specifier = ">=6.0" }, + { name = "questionary", specifier = ">=2.0" }, { name = "ragas", marker = "extra == 'rag'", specifier = ">=0.2" }, { name = "rarfile", marker = "extra == 'process'", specifier = ">=4.1" }, { name = "requests", marker = "extra == 'api'", specifier = ">=2.31" }, { name = "requests", marker = "extra == 'process'", specifier = ">=2.31" }, + { name = "rich", specifier = ">=13" }, { name = 
"ruff", marker = "extra == 'dev'", specifier = ">=0.4.0" }, { name = "scipy", marker = "extra == 'index'", specifier = ">=1.8" }, { name = "sentence-transformers", marker = "extra == 'index'" }, @@ -5892,6 +5896,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c1/1b/f7ea6cde25621cd9236541c66ff018f4268012a534ec31032bcb187dc5e7/proglog-0.1.12-py3-none-any.whl", hash = "sha256:ccaafce51e80a81c65dc907a460c07ccb8ec1f78dc660cfd8f9ec3a22f01b84c", size = 6337, upload-time = "2025-05-09T14:36:16.798Z" }, ] +[[package]] +name = "prompt-toolkit" +version = "3.0.52" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "wcwidth" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a1/96/06e01a7b38dce6fe1db213e061a4602dd6032a8a97ef6c1a862537732421/prompt_toolkit-3.0.52.tar.gz", hash = "sha256:28cde192929c8e7321de85de1ddbe736f1375148b02f2e17edd840042b1be855", size = 434198, upload-time = "2025-08-27T15:24:02.057Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/84/03/0d3ce49e2505ae70cf43bc5bb3033955d2fc9f932163e84dc0779cc47f48/prompt_toolkit-3.0.52-py3-none-any.whl", hash = "sha256:9aac639a3bbd33284347de5ad8d68ecc044b91a762dc39b7c21095fcd6a19955", size = 391431, upload-time = "2025-08-27T15:23:59.498Z" }, +] + [[package]] name = "propcache" version = "0.4.1" @@ -6875,6 +6891,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f1/12/de94a39c2ef588c7e6455cfbe7343d3b2dc9d6b6b2f40c4c6565744c873d/pyyaml-6.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:ebc55a14a21cb14062aa4162f906cd962b28e2e9ea38f9b4391244cd8de4ae0b", size = 149341, upload-time = "2025-09-25T21:32:56.828Z" }, ] +[[package]] +name = "questionary" +version = "2.1.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "prompt-toolkit" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/f6/45/eafb0bba0f9988f6a2520f9ca2df2c82ddfa8d67c95d6625452e97b204a5/questionary-2.1.1.tar.gz", hash = 
"sha256:3d7e980292bb0107abaa79c68dd3eee3c561b83a0f89ae482860b181c8bd412d", size = 25845, upload-time = "2025-08-28T19:00:20.851Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3c/26/1062c7ec1b053db9e499b4d2d5bc231743201b74051c973dadeac80a8f43/questionary-2.1.1-py3-none-any.whl", hash = "sha256:a51af13f345f1cdea62347589fbb6df3b290306ab8930713bfae4d475a7d4a59", size = 36753, upload-time = "2025-08-28T19:00:19.56Z" }, +] + [[package]] name = "ragas" version = "0.4.3" From d85be08b6c74006e0fb849751ec58fb7746db4f3 Mon Sep 17 00:00:00 2001 From: Arthur PERRIN Date: Sat, 9 May 2026 11:45:49 +0200 Subject: [PATCH 02/24] fix(tui): address PR review feedback - Centralise QSTYLE/QMARK in theme.py (was duplicated in app.py and config_builder.py) - Derive pipeline output paths via load_config + jsonl_path so env-var expansion ($ROOT_OUT_DIR, ...) and the directory-vs-jsonl logic match what the underlying commands actually use - Add paths.py: repo_root() walks up from CWD to find examples/, so the TUI works from any working directory; cwd_default() gives ./data-style defaults instead of repo-relative paths - Replace examples/... 
defaults in guided prompts with cwd_default() fallbacks so the TUI is sensible from outside the repo - ruff format pass on the tui/ package --- src/mmore/tui/app.py | 55 +++++++------ src/mmore/tui/commands.py | 20 ++++- src/mmore/tui/config_builder.py | 135 ++++++++++++++++++-------------- src/mmore/tui/paths.py | 52 ++++++++++++ src/mmore/tui/pipeline.py | 79 +++++++++++++------ src/mmore/tui/theme.py | 27 +++++-- 6 files changed, 257 insertions(+), 111 deletions(-) create mode 100644 src/mmore/tui/paths.py diff --git a/src/mmore/tui/app.py b/src/mmore/tui/app.py index 9feddf10..734fe6c2 100644 --- a/src/mmore/tui/app.py +++ b/src/mmore/tui/app.py @@ -1,28 +1,29 @@ """mmore TUI entry point.""" + from __future__ import annotations import time import questionary -from questionary import Style -from rich.spinner import Spinner from rich.live import Live +from rich.spinner import Spinner from rich.text import Text from mmore.tui.commands import REGISTRY from mmore.tui.config_builder import pick_or_build_config +from mmore.tui.paths import cwd_default from mmore.tui.pipeline import run_full_pipeline -from mmore.tui.theme import ACCENT, ACCENT2, MUTED, OK, console, section, show_banner - -QSTYLE = Style([ - ("qmark", "fg:#5fd7ff bold"), - ("question", "bold"), - ("answer", "fg:#ff5fd7 bold"), - ("pointer", "fg:#5fd7ff bold"), - ("highlighted", "fg:#5fd7ff bold"), - ("selected", "fg:#ff5fd7"), - ("instruction", "fg:#808080 italic"), -]) +from mmore.tui.theme import ( + ACCENT, + ACCENT2, + MUTED, + OK, + QMARK, + QSTYLE, + console, + section, + show_banner, +) def _run_with_spinner(label: str, fn, **kwargs) -> None: @@ -30,9 +31,7 @@ def _run_with_spinner(label: str, fn, **kwargs) -> None: spinner = Spinner("dots", text=Text(f" {label}…", style=ACCENT)) with Live(spinner, console=console, refresh_per_second=12, transient=True): fn(**kwargs) - console.print( - f" [{OK}]✓[/] {label} [dim]({time.time() - start:.1f}s)[/dim]" - ) + console.print(f" [{OK}]✓[/] {label} 
[dim]({time.time() - start:.1f}s)[/dim]") def _run_single_command() -> None: @@ -41,7 +40,10 @@ def _run_single_command() -> None: for spec in REGISTRY.values() ] name = questionary.select( - "Pick a command", choices=choices, style=QSTYLE, qmark="▸", + "Pick a command", + choices=choices, + style=QSTYLE, + qmark=QMARK, ).ask() if name is None: return @@ -51,19 +53,22 @@ def _run_single_command() -> None: if spec.needs_input_data: input_data = questionary.text( "Input JSONL path", - default="examples/process/outputs/merged/merged_results.jsonl", - style=QSTYLE, qmark="▸", + default=cwd_default("outputs/process/merged/merged_results.jsonl"), + style=QSTYLE, + qmark=QMARK, ).ask() if input_data is None: return kwargs["input_data"] = input_data console.print() - console.print(section( - f"Running {name}", - Text(f"config: {config_file}", style=MUTED), - style=ACCENT2, - )) + console.print( + section( + f"Running {name}", + Text(f"config: {config_file}", style=MUTED), + style=ACCENT2, + ) + ) interactive = name in {"ragcli", "retrieve", "rag"} if interactive: spec.run(**kwargs) @@ -93,7 +98,7 @@ def _main_menu() -> str | None: questionary.Choice("✕ Quit", value="quit"), ], style=QSTYLE, - qmark="▸", + qmark=QMARK, ).ask() diff --git a/src/mmore/tui/commands.py b/src/mmore/tui/commands.py index fae5e67f..9ab63920 100644 --- a/src/mmore/tui/commands.py +++ b/src/mmore/tui/commands.py @@ -3,6 +3,7 @@ Each entry mirrors a Click command in `mmore.cli` so the TUI is a thin wrapper: the `run` callable is the same `run_*` function the CLI uses. 
""" + from dataclasses import dataclass, field from typing import Any, Callable, Optional @@ -22,58 +23,73 @@ class CommandSpec: def _process(config_file: str, **_): from mmore.run_process import process + process(config_file) def _postprocess(config_file: str, input_data: str, **_): from mmore.run_postprocess import postprocess + postprocess(config_file, input_data) -def _index(config_file: str, documents_path: Optional[str] = None, - collection_name: Optional[str] = None, **_): +def _index( + config_file: str, + documents_path: Optional[str] = None, + collection_name: Optional[str] = None, + **_, +): from mmore.run_index import index + index(config_file, documents_path, collection_name) def _retrieve(config_file: str, **_): from mmore.run_retriever import run_api + run_api(config_file, "0.0.0.0", 8001) def _rag(config_file: str, **_): from mmore.run_rag import rag + rag(config_file) def _ragcli(config_file: str, **_): from mmore.run_ragcli import RagCLI + RagCLI(config_file).launch_cli() def _websearch(config_file: str, **_): from mmore.run_websearch import run_websearch + run_websearch(config_file) # Lazy dataclass importers — keeps heavy deps out of TUI startup. def _dc_process(): from mmore.run_process import ProcessInference + return ProcessInference def _dc_postprocess(): from mmore.process.post_processor.pipeline import PPPipelineConfig + return PPPipelineConfig def _dc_index(): from mmore.run_index import IndexConfig + return IndexConfig def _dc_rag(): from mmore.run_rag import RAGInferenceConfig + return RAGInferenceConfig diff --git a/src/mmore/tui/config_builder.py b/src/mmore/tui/config_builder.py index f91cc2fe..be31c41d 100644 --- a/src/mmore/tui/config_builder.py +++ b/src/mmore/tui/config_builder.py @@ -5,6 +5,7 @@ falls back to the example defaults. The resulting dict is dumped to a YAML file under `./tui-configs/`. 
""" + from __future__ import annotations import os @@ -14,35 +15,29 @@ import questionary import yaml -from questionary import Style from rich.panel import Panel from rich.text import Text from mmore.tui.commands import CommandSpec +from mmore.tui.paths import cwd_default, repo_root, resolve_example +from mmore.tui.theme import QMARK, QSTYLE, console CONFIG_DIR = Path("./tui-configs") -QSTYLE = Style([ - ("qmark", "fg:#5fd7ff bold"), - ("question", "bold"), - ("answer", "fg:#ff5fd7 bold"), - ("pointer", "fg:#5fd7ff bold"), - ("highlighted", "fg:#5fd7ff bold"), - ("selected", "fg:#ff5fd7"), - ("instruction", "fg:#808080 italic"), -]) -QMARK = "▸" - def _prompt(question: str, default: str = "") -> str: - answer = questionary.text(question, default=default, style=QSTYLE, qmark=QMARK).ask() + answer = questionary.text( + question, default=default, style=QSTYLE, qmark=QMARK + ).ask() if answer is None: raise KeyboardInterrupt return answer def _confirm(question: str, default: bool = False) -> bool: - answer = questionary.confirm(question, default=default, style=QSTYLE, qmark=QMARK).ask() + answer = questionary.confirm( + question, default=default, style=QSTYLE, qmark=QMARK + ).ask() if answer is None: raise KeyboardInterrupt return answer @@ -57,9 +52,14 @@ def _save(name: str, data: dict[str, Any]) -> str: def build_process_config() -> str: - data_path = _prompt("Data path (folder with documents to process)", "examples/sample_data/") - output_path = _prompt("Output path (where merged_results.jsonl will be written)", - "examples/process/outputs/") + data_path = _prompt( + "Data path (folder with documents to process)", + cwd_default("data"), + ) + output_path = _prompt( + "Output path (where merged_results.jsonl will be written)", + cwd_default("outputs/process"), + ) use_fast = _confirm("Use fast (lower-quality) processors?", default=False) distributed = _confirm("Use distributed processing (Dask)?", default=False) extract_images = _confirm("Extract images from 
documents?", default=True) @@ -116,7 +116,8 @@ def build_postprocess_config() -> str: "Chunking strategy", choices=["sentence", "token", "word", "semantic"], default="sentence", - style=QSTYLE, qmark=QMARK, + style=QSTYLE, + qmark=QMARK, ).ask() if strategy is None: raise KeyboardInterrupt @@ -124,20 +125,26 @@ def build_postprocess_config() -> str: "Table handling", choices=["single_row", "multi_rows", "keep_whole", "none"], default="single_row", - style=QSTYLE, qmark=QMARK, + style=QSTYLE, + qmark=QMARK, ).ask() if table_handling is None: raise KeyboardInterrupt - output_path = _prompt("Output JSONL path", - "examples/postprocessor/outputs/merged/results.jsonl") + output_path = _prompt( + "Output JSONL path", + cwd_default("outputs/postprocess/results.jsonl"), + ) cfg = { "previous_results": None, "pp_modules": [ - {"type": "chunker", "args": { - "chunking_strategy": strategy, - "table_handling": table_handling, - }}, + { + "type": "chunker", + "args": { + "chunking_strategy": strategy, + "table_handling": table_handling, + }, + }, ], "output": {"output_path": output_path, "save_each_step": True}, } @@ -145,15 +152,16 @@ def build_postprocess_config() -> str: def build_index_config(documents_path: Optional[str] = None) -> str: - dense = _prompt("Dense embedding model", - "sentence-transformers/all-MiniLM-L6-v2") + dense = _prompt("Dense embedding model", "sentence-transformers/all-MiniLM-L6-v2") sparse = _prompt("Sparse embedding model", "splade") - db_uri = _prompt("DB URI (Milvus Lite file or server URL)", "./proc_demo.db") + db_uri = _prompt( + "DB URI (Milvus Lite file or server URL)", cwd_default("proc_demo.db") + ) db_name = _prompt("DB name", "my_db") collection = _prompt("Collection name", "my_docs") docs = documents_path or _prompt( "Documents JSONL path", - "examples/postprocessor/outputs/merged/results.jsonl", + cwd_default("outputs/postprocess/results.jsonl"), ) cfg = { "indexer": { @@ -174,20 +182,20 @@ def build_index_config(documents_path: 
Optional[str] = None) -> str: } -def find_yaml_configs(spec: CommandSpec, root: str = ".") -> list[str]: +def find_yaml_configs(spec: CommandSpec) -> list[str]: """Find candidate YAML configs scoped to this stage. - Includes: - - files matching any of `spec.config_globs` - - previously-generated `tui-configs/-*.yaml` + Globs are evaluated against the resolved repo root (looked up by walking + up from CWD), so the TUI works from any working directory. Generated + configs in `./tui-configs/` (CWD-relative) are always included so users + keep access to configs they just built. """ - root_path = Path(root) + root = repo_root() or Path.cwd() matches: list[str] = [] for pattern in spec.config_globs: - for p in root_path.glob(pattern): + for p in root.glob(pattern): matches.append(str(p)) - # Generated configs from previous TUI runs - generated = root_path / "tui-configs" + generated = Path.cwd() / "tui-configs" if generated.exists(): for p in sorted(generated.glob(f"{spec.name}-*.yaml")): matches.append(str(p)) @@ -207,6 +215,7 @@ def _validate_yaml(path: str, spec: CommandSpec) -> Optional[str]: return None try: from mmore.utils import load_config + dataclass_cls = spec.config_dataclass() load_config(path, dataclass_cls) return None @@ -215,28 +224,35 @@ def _validate_yaml(path: str, spec: CommandSpec) -> Optional[str]: def _show_error_panel(path: str, err: str) -> None: - from mmore.tui.theme import console - console.print(Panel( - Text.assemble( - (f"{path}\n\n", "bold"), - (err, "red"), - ), - title="[bold red]invalid config[/]", - border_style="red", - padding=(1, 2), - )) + console.print( + Panel( + Text.assemble( + (f"{path}\n\n", "bold"), + (err, "red"), + ), + title="[bold red]invalid config[/]", + border_style="red", + padding=(1, 2), + ) + ) def _ranked_choices(spec: CommandSpec, candidates: list[str]) -> list[Any]: """Put `spec.example_config` first as ★ recommended; rest under a separator.""" choices: list[Any] = [] - rec = spec.example_config + 
rec_resolved: Optional[str] = None + if spec.example_config: + rec_resolved = resolve_example(spec.example_config) rest = list(candidates) - if rec and rec in rest: - choices.append(questionary.Choice(f"★ {rec} (recommended)", value=rec)) - rest.remove(rec) - elif rec and Path(rec).exists(): - choices.append(questionary.Choice(f"★ {rec} (recommended)", value=rec)) + if rec_resolved and rec_resolved in rest: + choices.append( + questionary.Choice(f"★ {rec_resolved} (recommended)", value=rec_resolved) + ) + rest.remove(rec_resolved) + elif rec_resolved and Path(rec_resolved).exists(): + choices.append( + questionary.Choice(f"★ {rec_resolved} (recommended)", value=rec_resolved) + ) if rest: if choices: choices.append(questionary.Separator("── other configs ──")) @@ -245,7 +261,9 @@ def _ranked_choices(spec: CommandSpec, candidates: list[str]) -> list[Any]: return choices -def pick_or_build_config(spec: CommandSpec, documents_path: Optional[str] = None) -> str: +def pick_or_build_config( + spec: CommandSpec, documents_path: Optional[str] = None +) -> str: """Ask the user to either pick an existing YAML or generate one. 
Validates the chosen YAML against the stage's dataclass and re-prompts @@ -259,7 +277,8 @@ def pick_or_build_config(spec: CommandSpec, documents_path: Optional[str] = None questionary.Choice("✨ Generate new YAML (guided)", value="build"), questionary.Choice("⌨ Type a path manually", value="manual"), ], - style=QSTYLE, qmark=QMARK, + style=QSTYLE, + qmark=QMARK, ).ask() if choice is None: raise KeyboardInterrupt @@ -271,7 +290,8 @@ def pick_or_build_config(spec: CommandSpec, documents_path: Optional[str] = None ranked = _ranked_choices(spec, candidates) if not ranked: questionary.print( - f"No YAML configs found for `{spec.name}`, falling back to manual entry.", + f"No YAML configs found for `{spec.name}`, " + "falling back to manual entry.", style="fg:yellow", ) choice = "manual" @@ -279,7 +299,8 @@ def pick_or_build_config(spec: CommandSpec, documents_path: Optional[str] = None picked = questionary.select( f"Select a config for `{spec.name}`", choices=ranked, - style=QSTYLE, qmark=QMARK, + style=QSTYLE, + qmark=QMARK, ).ask() if picked is None: raise KeyboardInterrupt diff --git a/src/mmore/tui/paths.py b/src/mmore/tui/paths.py new file mode 100644 index 00000000..17194f00 --- /dev/null +++ b/src/mmore/tui/paths.py @@ -0,0 +1,52 @@ +"""Locate bundled example configs regardless of CWD or install layout. + +Strategy: +- If `examples/` exists relative to CWD (source checkout), use it. +- Else, walk up from CWD looking for a repo root that contains `examples/`. +- Else, fall back to `importlib.resources` to read examples shipped with the + package (only available if the wheel actually bundles them). +- If nothing is found, return the original repo-relative path so error + messages stay readable; callers handle "missing" gracefully. 
+""" + +from __future__ import annotations + +import os +from functools import lru_cache +from pathlib import Path +from typing import Optional + + +@lru_cache(maxsize=1) +def repo_root() -> Optional[Path]: + """Return a directory that contains an `examples/` folder, if any.""" + cwd = Path.cwd() + for candidate in [cwd, *cwd.parents]: + if (candidate / "examples").is_dir(): + return candidate + return None + + +def resolve_example(rel: str) -> str: + """Resolve an `examples/...` relative path to an absolute one. + + Falls back to the original string if no source checkout is found, so the + UI can still display it (and the validator will surface a clear error). + """ + root = repo_root() + if root is not None: + candidate = root / rel + if candidate.exists(): + return str(candidate) + return rel + + +def resolve_glob(pattern: str) -> tuple[Path, str]: + """Split a relative glob into (root, remaining-pattern) for Path.glob.""" + root = repo_root() or Path.cwd() + return root, pattern + + +def cwd_default(rel: str) -> str: + """A safe default path rooted at CWD (e.g. 
`./data` instead of `examples/...`).""" + return os.path.join(".", rel) diff --git a/src/mmore/tui/pipeline.py b/src/mmore/tui/pipeline.py index 4312c3de..1114a5ba 100644 --- a/src/mmore/tui/pipeline.py +++ b/src/mmore/tui/pipeline.py @@ -1,32 +1,56 @@ """Chain process -> postprocess -> index from the TUI.""" + from __future__ import annotations import os import time import questionary -import yaml -from rich.spinner import Spinner from rich.live import Live +from rich.spinner import Spinner from rich.table import Table from rich.text import Text from mmore.tui.commands import REGISTRY from mmore.tui.config_builder import pick_or_build_config -from mmore.tui.theme import ACCENT, ACCENT2, MUTED, OK, console, section, step_header +from mmore.tui.theme import ( + ACCENT, + ACCENT2, + MUTED, + OK, + console, + section, + step_header, +) def _process_output_jsonl(config_path: str) -> str: - with open(config_path) as f: - cfg = yaml.safe_load(f) - out = cfg["dispatcher_config"]["output_path"] + """Resolve the JSONL path the `process` step writes to. + + Goes through `mmore.utils.load_config` so env-var expansion ($ROOT_OUT_DIR, + etc.) matches what the underlying command sees. + """ + from mmore.run_process import ProcessInference + from mmore.utils import load_config + + cfg: ProcessInference = load_config(config_path, ProcessInference) + out = cfg.dispatcher_config.output_path return os.path.join(out, "merged", "merged_results.jsonl") def _postprocess_output_jsonl(config_path: str) -> str: - with open(config_path) as f: - cfg = yaml.safe_load(f) - return cfg["output"]["output_path"] + """Resolve the JSONL path `postprocess` writes to. + + Mirrors `PPPipeline`'s use of `mmore.process.utils.jsonl_path`: if the + configured `output_path` is a directory, the pipeline writes to + `{output_path}/final.jsonl`; if it already ends in `.jsonl`, it's used as-is. 
+ """ + from mmore.process.post_processor.pipeline import PPPipelineConfig + from mmore.process.utils import jsonl_path + from mmore.utils import load_config + + cfg: PPPipelineConfig = load_config(config_path, PPPipelineConfig) + return jsonl_path(cfg.output.output_path) def _run_step(label: str, fn, **kwargs) -> float: @@ -61,37 +85,48 @@ def _summary_table(rows: list[tuple[str, str, float]]) -> Table: def run_full_pipeline() -> None: console.print() - console.print(section( - "Full pipeline", - Text("process → postprocess → index → (optional) chat", style=ACCENT), - style=ACCENT2, - )) + console.print( + section( + "Full pipeline", + Text("process → postprocess → index → (optional) chat", style=ACCENT), + style=ACCENT2, + ) + ) rows: list[tuple[str, str, float]] = [] # process step_header(1, 3, "process") process_cfg = pick_or_build_config(REGISTRY["process"]) - elapsed = _run_step("Crawling + extracting documents", - REGISTRY["process"].run, config_file=process_cfg) + elapsed = _run_step( + "Crawling + extracting documents", + REGISTRY["process"].run, + config_file=process_cfg, + ) process_jsonl = _process_output_jsonl(process_cfg) rows.append(("process", process_jsonl, elapsed)) # postprocess step_header(2, 3, "postprocess") pp_cfg = pick_or_build_config(REGISTRY["postprocess"]) - elapsed = _run_step("Chunking + cleaning", - REGISTRY["postprocess"].run, - config_file=pp_cfg, input_data=process_jsonl) + elapsed = _run_step( + "Chunking + cleaning", + REGISTRY["postprocess"].run, + config_file=pp_cfg, + input_data=process_jsonl, + ) pp_jsonl = _postprocess_output_jsonl(pp_cfg) rows.append(("postprocess", pp_jsonl, elapsed)) # index step_header(3, 3, "index") index_cfg = pick_or_build_config(REGISTRY["index"], documents_path=pp_jsonl) - elapsed = _run_step("Embedding + indexing into Milvus", - REGISTRY["index"].run, - config_file=index_cfg, documents_path=pp_jsonl) + elapsed = _run_step( + "Embedding + indexing into Milvus", + REGISTRY["index"].run, + 
config_file=index_cfg, + documents_path=pp_jsonl, + ) rows.append(("index", "(vector DB)", elapsed)) console.print() diff --git a/src/mmore/tui/theme.py b/src/mmore/tui/theme.py index 8d71fec6..d719c351 100644 --- a/src/mmore/tui/theme.py +++ b/src/mmore/tui/theme.py @@ -1,6 +1,8 @@ """Shared visuals: banner, palette, panel helpers.""" + from __future__ import annotations +from questionary import Style from rich.align import Align from rich.console import Console, Group from rich.panel import Panel @@ -8,6 +10,19 @@ console = Console() +QSTYLE = Style( + [ + ("qmark", "fg:#5fd7ff bold"), + ("question", "bold"), + ("answer", "fg:#ff5fd7 bold"), + ("pointer", "fg:#5fd7ff bold"), + ("highlighted", "fg:#5fd7ff bold"), + ("selected", "fg:#ff5fd7"), + ("instruction", "fg:#808080 italic"), + ] +) +QMARK = "▸" + # Palette ACCENT = "bright_cyan" ACCENT2 = "magenta" @@ -43,11 +58,13 @@ def show_banner(subtitle: str = "interactive launcher") -> None: _gradient(BANNER), Align.center(Text(subtitle, style=f"italic {MUTED}")), ) - console.print(Panel( - body, - border_style=ACCENT, - padding=(0, 2), - )) + console.print( + Panel( + body, + border_style=ACCENT, + padding=(0, 2), + ) + ) def section(title: str, body: str | Text, style: str = ACCENT) -> Panel: From bbb5d9179ee8f0f6c90fc740dab41ea735d54691 Mon Sep 17 00:00:00 2001 From: Arthur PERRIN Date: Tue, 12 May 2026 08:50:07 +0200 Subject: [PATCH 03/24] feat(tui): full-pipeline wizard, extras detection, cancel-vs-quit - Add a guided wizard ("Build a full pipeline config") that generates coherent process + postprocess + index YAMLs in one flow, exposing only processors / post-processors / indexer types that actually exist in the repo (pulled from ProcessorRegistry, TAGGER_TYPES, FILTER_TYPES). - Detect missing extras per stage via importlib.util.find_spec canaries; disable menu entries and surface the exact `uv sync --extra ...` hint instead of crashing mid-run with ModuleNotFoundError. 
- Move questionary + rich out of core dependencies into a new `tui` extra (included in `all`); friendly error from `mmore tui` if missing. - Introduce CancelledByUser so Ctrl-C / Esc inside a sub-flow returns to the main menu instead of exiting the whole TUI. Ctrl-C at the main menu still quits. - Add a spinner during YAML validation (dataclass imports take ~5s and made the TUI look frozen). - Document the TUI in the README with install commands and behavior. --- README.md | 18 ++ pyproject.toml | 10 +- src/mmore/cli.py | 10 +- src/mmore/tui/app.py | 137 ++++++++++++-- src/mmore/tui/commands.py | 40 ++++ src/mmore/tui/config_builder.py | 312 ++++++++++++++++++++++++++++++-- src/mmore/tui/exceptions.py | 11 ++ src/mmore/tui/pipeline.py | 55 +++++- uv.lock | 16 +- 9 files changed, 558 insertions(+), 51 deletions(-) create mode 100644 src/mmore/tui/exceptions.py diff --git a/README.md b/README.md index 61c482a6..02a1b4c4 100644 --- a/README.md +++ b/README.md @@ -103,6 +103,24 @@ uv pip install "mmore[process,cpu]" > :warning: **Check the instructions for contributors directly at [`docs/for_devs.md`](./docs/for_devs.md)** +### Interactive TUI + +Prefer a guided experience over editing YAML by hand? Install the `tui` extra and launch the interactive Terminal UI: + +```bash +uv sync --extra tui --extra process --extra index --extra cpu +mmore tui +``` + +From the launcher you can: + +- run any stage (process / postprocess / index / rag / chat) interactively, +- chain the full pipeline (process → postprocess → index → chat), +- generate stage YAML configs through a guided wizard, +- pick from existing example configs without leaving the terminal. + +Generated configs land in `./tui-configs/` and are validated against the stage's dataclass before any run. Stages whose extras are missing are greyed out in the menu with the exact `uv sync --extra ...` command to enable them. 
Press `Ctrl-C` inside a sub-flow to cancel and return to the main menu; press it at the main menu to quit. + ### Minimal Example You can use our predefined CLI commands to execute parts of the pipeline. Note that you might need to prepend `python -m` to the command if the package does not properly create bash aliases. diff --git a/pyproject.toml b/pyproject.toml index b9428fa9..258d3fd7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,8 +41,6 @@ dependencies = [ "typing_extensions>=4.15.0,<5.0", "PyYAML>=6.0", "setuptools<81", - "questionary>=2.0", - "rich>=13" ] [project.optional-dependencies] @@ -128,8 +126,14 @@ api = [ # --- Composite + variant extras --- +tui = [ + # Interactive terminal launcher (`mmore tui`) + "questionary>=2.0", + "rich>=13", +] + all = [ - "mmore[process,rag,api,websearch]", + "mmore[process,rag,api,websearch,tui]", ] cpu = [ diff --git a/src/mmore/cli.py b/src/mmore/cli.py index 7e8e2af2..080b4be9 100644 --- a/src/mmore/cli.py +++ b/src/mmore/cli.py @@ -268,8 +268,14 @@ def ragcli(config_file: str): @main.command() def tui(): """Launch the interactive Terminal UI.""" - from .tui import run - + try: + from .tui import run + except ImportError as e: + click.echo( + f"TUI dependencies missing ({e.name or e}). 
" + "Install with: uv sync --extra tui" + ) + raise SystemExit(1) run() diff --git a/src/mmore/tui/app.py b/src/mmore/tui/app.py index 734fe6c2..9f414469 100644 --- a/src/mmore/tui/app.py +++ b/src/mmore/tui/app.py @@ -9,10 +9,16 @@ from rich.spinner import Spinner from rich.text import Text -from mmore.tui.commands import REGISTRY -from mmore.tui.config_builder import pick_or_build_config +from rich.panel import Panel + +from mmore.tui.commands import REGISTRY, check_stage_available +from mmore.tui.config_builder import ( + build_full_pipeline_wizard, + pick_or_build_config, +) +from mmore.tui.exceptions import CancelledByUser from mmore.tui.paths import cwd_default -from mmore.tui.pipeline import run_full_pipeline +from mmore.tui.pipeline import run_full_pipeline, run_pipeline_with_configs from mmore.tui.theme import ( ACCENT, ACCENT2, @@ -26,6 +32,20 @@ ) +def _show_missing_extras(spec_name: str, hint: str) -> None: + console.print( + Panel( + Text.assemble( + (f"Stage `{spec_name}` can't run.\n\n", "bold"), + (hint, "yellow"), + ), + title="[bold yellow]missing dependencies[/]", + border_style="yellow", + padding=(1, 2), + ) + ) + + def _run_with_spinner(label: str, fn, **kwargs) -> None: start = time.time() spinner = Spinner("dots", text=Text(f" {label}…", style=ACCENT)) @@ -35,10 +55,17 @@ def _run_with_spinner(label: str, fn, **kwargs) -> None: def _run_single_command() -> None: - choices = [ - questionary.Choice(f"{spec.name:<12} — {spec.description}", value=spec.name) - for spec in REGISTRY.values() - ] + choices = [] + for spec in REGISTRY.values(): + hint = check_stage_available(spec) + label = f"{spec.name:<12} — {spec.description}" + if hint: + label += " [dim](extras missing)[/dim]" + choices.append( + questionary.Choice(label, value=spec.name, disabled=hint) + ) + else: + choices.append(questionary.Choice(label, value=spec.name)) name = questionary.select( "Pick a command", choices=choices, @@ -48,6 +75,11 @@ def _run_single_command() -> None: if 
name is None: return spec = REGISTRY[name] + # Defensive re-check in case the user typed past the disabled state. + hint = check_stage_available(spec) + if hint: + _show_missing_extras(spec.name, hint) + return config_file = pick_or_build_config(spec) kwargs = {"config_file": config_file} if spec.needs_input_data: @@ -84,16 +116,68 @@ def _chat_only() -> None: REGISTRY["ragcli"].run(config_file=config_file) +def _run_full_wizard() -> None: + paths = build_full_pipeline_wizard() + console.print() + console.print( + section( + "Wizard complete", + Text( + "process: " + paths["process"] + "\n" + "postprocess: " + paths["postprocess"] + "\n" + "index: " + paths["index"], + style=MUTED, + ), + style=ACCENT2, + ) + ) + if questionary.confirm( + "Run the pipeline now with these configs?", + default=True, + style=QSTYLE, + qmark=QMARK, + ).ask(): + run_pipeline_with_configs(paths["process"], paths["postprocess"], paths["index"]) + + +def _pipeline_hint() -> str | None: + """Return a combined hint if any of process/postprocess/index is missing.""" + hints = [ + check_stage_available(REGISTRY[s]) + for s in ("process", "postprocess", "index") + ] + hints = [h for h in hints if h] + return " | ".join(hints) if hints else None + + def _main_menu() -> str | None: + pipeline_hint = _pipeline_hint() + chat_hint = check_stage_available(REGISTRY["ragcli"]) + + pipeline_choice = questionary.Choice( + "🚀 Run full pipeline (process → postprocess → index)" + + (" [dim](extras missing)[/dim]" if pipeline_hint else ""), + value="pipeline", + disabled=pipeline_hint, + ) + wizard_choice = questionary.Choice( + "🧙 Build a full pipeline config (guided wizard)", + value="wizard", + ) # wizard only writes YAML, no heavy imports needed + chat_choice = questionary.Choice( + "💬 Chat with indexed documents" + + (" [dim](extras missing)[/dim]" if chat_hint else ""), + value="chat", + disabled=chat_hint, + ) + return questionary.select( "What do you want to do?", choices=[ questionary.Choice("⚙ Run 
a single command", value="single"), - questionary.Choice( - "🚀 Run full pipeline (process → postprocess → index)", - value="pipeline", - ), - questionary.Choice("💬 Chat with indexed documents", value="chat"), + pipeline_choice, + wizard_choice, + chat_choice, questionary.Separator(), questionary.Choice("✕ Quit", value="quit"), ], @@ -106,21 +190,36 @@ def run() -> None: console.clear() show_banner("interactive launcher") while True: + # Ctrl-C at the main menu itself quits; inside any sub-flow it + # cancels and returns here. try: mode = _main_menu() - if mode in (None, "quit"): - console.print(f"[{ACCENT}]bye![/]") - return + except KeyboardInterrupt: + console.print(f"\n[{ACCENT}]bye![/]") + return + if mode in (None, "quit"): + console.print(f"[{ACCENT}]bye![/]") + return + + try: if mode == "single": _run_single_command() elif mode == "pipeline": run_full_pipeline() + elif mode == "wizard": + _run_full_wizard() elif mode == "chat": _chat_only() - except KeyboardInterrupt: - console.print(f"\n[{ACCENT2}]interrupted.[/]") - return + except (CancelledByUser, KeyboardInterrupt): + console.print(f"[{ACCENT2}]cancelled — back to menu.[/]") + continue except Exception as e: # noqa: BLE001 console.print(f"[bold red]error:[/] {e}") - if not questionary.confirm("Continue?", default=True, style=QSTYLE).ask(): + try: + cont = questionary.confirm( + "Continue?", default=True, style=QSTYLE + ).ask() + except KeyboardInterrupt: + return + if not cont: return diff --git a/src/mmore/tui/commands.py b/src/mmore/tui/commands.py index 9ab63920..b498e351 100644 --- a/src/mmore/tui/commands.py +++ b/src/mmore/tui/commands.py @@ -4,6 +4,7 @@ the `run` callable is the same `run_*` function the CLI uses. """ +import importlib.util from dataclasses import dataclass, field from typing import Any, Callable, Optional @@ -19,6 +20,31 @@ class CommandSpec: # Lazy importer returning the dataclass to validate YAML against. # Returns None if no validation is wired up for this stage. 
config_dataclass: Optional[Callable[[], Any]] = None + # Extras the user has to `uv sync --extra ...` for this stage to import. + # Used only to build a friendly install hint. + required_extras: list[str] = field(default_factory=list) + # Module names probed via `importlib.util.find_spec` to verify the extras + # are actually installed. If any is missing, the stage is disabled in the + # menu with an install hint. + canary_imports: list[str] = field(default_factory=list) + + +def check_stage_available(spec: "CommandSpec") -> Optional[str]: + """Return None if all canary imports resolve, else an install-hint string.""" + missing: list[str] = [] + for mod in spec.canary_imports: + try: + if importlib.util.find_spec(mod) is None: + missing.append(mod) + except (ImportError, ValueError): + missing.append(mod) + if not missing: + return None + extras = " ".join(f"--extra {e}" for e in spec.required_extras) + return ( + f"Missing: {', '.join(missing)}. " + f"Install with: uv sync {extras}".strip() + ) def _process(config_file: str, **_): @@ -104,6 +130,8 @@ def _dc_rag(): "examples/process/**/*.yml", ], config_dataclass=_dc_process, + required_extras=["process", "cpu"], + canary_imports=["torch", "marker", "transformers"], ), "postprocess": CommandSpec( name="postprocess", @@ -116,6 +144,8 @@ def _dc_rag(): "examples/postprocessor/**/*.yml", ], config_dataclass=_dc_postprocess, + required_extras=["process", "cpu"], + canary_imports=["torch", "transformers"], ), "index": CommandSpec( name="index", @@ -127,6 +157,8 @@ def _dc_rag(): "examples/index/**/*.yml", ], config_dataclass=_dc_index, + required_extras=["index", "cpu"], + canary_imports=["pymilvus", "sentence_transformers", "torch"], ), "retrieve": CommandSpec( name="retrieve", @@ -138,6 +170,8 @@ def _dc_rag(): "examples/rag/**/*.yml", ], config_dataclass=_dc_rag, + required_extras=["rag", "api", "cpu"], + canary_imports=["fastapi", "pymilvus", "torch"], ), "rag": CommandSpec( name="rag", @@ -149,6 +183,8 @@ def 
_dc_rag(): "examples/rag/**/*.yml", ], config_dataclass=_dc_rag, + required_extras=["rag", "cpu"], + canary_imports=["langchain", "pymilvus", "torch"], ), "ragcli": CommandSpec( name="ragcli", @@ -160,6 +196,8 @@ def _dc_rag(): "examples/rag/**/*.yml", ], config_dataclass=_dc_rag, + required_extras=["rag", "cpu"], + canary_imports=["langchain", "pymilvus", "torch"], ), "websearch": CommandSpec( name="websearch", @@ -170,5 +208,7 @@ def _dc_rag(): "examples/websearchRAG/**/*.yaml", "examples/websearchRAG/**/*.yml", ], + required_extras=["websearch"], + canary_imports=["ddgs"], ), } diff --git a/src/mmore/tui/config_builder.py b/src/mmore/tui/config_builder.py index be31c41d..e9b257d6 100644 --- a/src/mmore/tui/config_builder.py +++ b/src/mmore/tui/config_builder.py @@ -15,32 +15,42 @@ import questionary import yaml +from rich.live import Live from rich.panel import Panel +from rich.spinner import Spinner from rich.text import Text from mmore.tui.commands import CommandSpec +from mmore.tui.exceptions import CancelledByUser from mmore.tui.paths import cwd_default, repo_root, resolve_example -from mmore.tui.theme import QMARK, QSTYLE, console +from mmore.tui.theme import ACCENT2, QMARK, QSTYLE, console, section -CONFIG_DIR = Path("./tui-configs") +def _ask(prompt_obj: Any) -> Any: + """Call .ask() and translate Ctrl-C / Esc into CancelledByUser. -def _prompt(question: str, default: str = "") -> str: - answer = questionary.text( - question, default=default, style=QSTYLE, qmark=QMARK - ).ask() + questionary raises KeyboardInterrupt on Ctrl-C and returns None on Esc. + Both should land us back at the main menu, not exit the TUI. 
+ """ + try: + answer = prompt_obj.ask() + except KeyboardInterrupt as e: + raise CancelledByUser("cancelled") from e if answer is None: - raise KeyboardInterrupt + raise CancelledByUser("cancelled") return answer +CONFIG_DIR = Path("./tui-configs") + + +def _prompt(question: str, default: str = "") -> str: + return _ask(questionary.text(question, default=default, style=QSTYLE, qmark=QMARK)) + def _confirm(question: str, default: bool = False) -> bool: - answer = questionary.confirm( - question, default=default, style=QSTYLE, qmark=QMARK - ).ask() - if answer is None: - raise KeyboardInterrupt - return answer + return _ask( + questionary.confirm(question, default=default, style=QSTYLE, qmark=QMARK) + ) def _save(name: str, data: dict[str, Any]) -> str: @@ -120,7 +130,7 @@ def build_postprocess_config() -> str: qmark=QMARK, ).ask() if strategy is None: - raise KeyboardInterrupt + raise CancelledByUser("cancelled") table_handling = questionary.select( "Table handling", choices=["single_row", "multi_rows", "keep_whole", "none"], @@ -129,7 +139,7 @@ def build_postprocess_config() -> str: qmark=QMARK, ).ask() if table_handling is None: - raise KeyboardInterrupt + raise CancelledByUser("cancelled") output_path = _prompt( "Output JSONL path", cwd_default("outputs/postprocess/results.jsonl"), @@ -182,6 +192,259 @@ def build_index_config(documents_path: Optional[str] = None) -> str: } +# Static list of processor class names — kept in sync with +# src/mmore/process/processors/*.py. Used by the full-pipeline wizard so the +# user can pick a subset rather than always shipping all 10. 
+_ALL_PROCESSORS: list[tuple[str, int]] = [ + ("PDFProcessor", 4000), + ("DOCXProcessor", 100), + ("PPTXProcessor", 100), + ("MarkdownProcessor", 100), + ("HTMLProcessor", 100), + ("TXTProcessor", 100), + ("EMLProcessor", 100), + ("SpreadsheetProcessor", 100), + ("MediaProcessor", 40), + ("URLProcessor", 40), +] + +_PROCESSOR_DEFAULT_CONFIG: dict[str, list[dict[str, Any]]] = { + "MediaProcessor": [ + {"normal_model": "openai/whisper-large-v3-turbo"}, + {"fast_model": "openai/whisper-tiny"}, + {"type": "automatic-speech-recognition"}, + {"sample_rate": 10}, + {"batch_size": 4}, + ], + "PDFProcessor": [ + {"PDFTEXT_CPU_WORKERS": 0}, + {"DETECTOR_BATCH_SIZE": 1}, + {"DETECTOR_POSTPROCESSING_CPU_WORKERS": 0}, + {"RECOGNITION_BATCH_SIZE": 1}, + {"OCR_PARALLEL_WORKERS": 0}, + {"TEXIFY_BATCH_SIZE": 1}, + {"LAYOUT_BATCH_SIZE": 1}, + {"ORDER_BATCH_SIZE": 1}, + {"TABLE_REC_BATCH_SIZE": 1}, + ], +} + + +def build_process_config_wizard() -> str: + """Richer process-config builder that lets the user pick processors.""" + data_path = _prompt( + "Data path (folder with documents to process)", cwd_default("data") + ) + output_path = _prompt( + "Output path (where merged_results.jsonl will be written)", + cwd_default("outputs/process"), + ) + use_fast = _confirm("Use fast (lower-quality) processors?", default=False) + distributed = _confirm("Use distributed processing (Dask)?", default=False) + extract_images = _confirm("Extract images from documents?", default=True) + + names = [n for n, _ in _ALL_PROCESSORS] + selected = questionary.checkbox( + "Select processors to enable", + choices=[questionary.Choice(n, value=n, checked=True) for n in names], + style=QSTYLE, + qmark=QMARK, + ).ask() + if selected is None: + raise CancelledByUser("cancelled") + if not selected: + selected = names # empty would mean a no-op pipeline; fall back to all + + customize = _confirm("Customize batch sizes?", default=False) + sizes: list[dict[str, int]] = [] + for name, default in _ALL_PROCESSORS: + if 
name not in selected: + continue + if customize: + raw = _prompt(f"Batch size for {name}", str(default)) + try: + value = int(raw) + except ValueError: + value = default + else: + value = default + sizes.append({name: value}) + + processor_config = { + name: cfg + for name, cfg in _PROCESSOR_DEFAULT_CONFIG.items() + if name in selected + } + + cfg = { + "data_path": data_path, + "google_drive_ids": [], + "previous_results": None, + "dispatcher_config": { + "output_path": output_path, + "use_fast_processors": use_fast, + "distributed": distributed, + "extract_images": extract_images, + "scheduler_file": None, + "process_batch_sizes": sizes, + "processor_config": processor_config, + }, + } + return _save("process", cfg) + + +def _postprocessor_choices() -> list[str]: + """Enumerate every post-processor `type` string the loader accepts.""" + from mmore.process.post_processor.filter import FILTER_TYPES + from mmore.process.post_processor.tagger import TAGGER_TYPES + + return ["chunker", "ner", "translator", "metafuse", *TAGGER_TYPES, *FILTER_TYPES] + + +def _ask_module_args(pp_type: str) -> dict[str, Any]: + if pp_type == "chunker": + strategy = questionary.select( + "Chunking strategy", + choices=["sentence", "token", "word", "semantic"], + default="sentence", + style=QSTYLE, + qmark=QMARK, + ).ask() + if strategy is None: + raise CancelledByUser("cancelled") + table_handling = questionary.select( + "Table handling", + choices=["single_row", "multi_rows", "keep_whole", "none"], + default="single_row", + style=QSTYLE, + qmark=QMARK, + ).ask() + if table_handling is None: + raise CancelledByUser("cancelled") + return { + "chunking_strategy": strategy, + "table_handling": table_handling, + } + if pp_type in {"ner", "translator", "metafuse"}: + if _confirm(f"Provide extra args for `{pp_type}` as YAML?", default=False): + raw = _prompt("YAML args (single line, e.g. 
{key: value})", "{}") + try: + parsed = yaml.safe_load(raw) or {} + if isinstance(parsed, dict): + return parsed + except yaml.YAMLError: + pass + return {} + return {} + + +def build_postprocess_config_wizard() -> str: + """Build a postprocess config with an arbitrary list of pp_modules.""" + available = _postprocessor_choices() + modules: list[dict[str, Any]] = [] + while True: + if modules: + console.print( + f" [dim]current modules:[/] {', '.join(m['type'] for m in modules)}" + ) + pp_type = questionary.select( + "Add a post-processor module" if not modules else "Add another module", + choices=[*available, questionary.Separator(), "(done)"], + style=QSTYLE, + qmark=QMARK, + ).ask() + if pp_type is None: + raise CancelledByUser("cancelled") + if pp_type == "(done)": + break + args = _ask_module_args(pp_type) + modules.append({"type": pp_type, "args": args}) + + output_path = _prompt( + "Output JSONL path", + cwd_default("outputs/postprocess/results.jsonl"), + ) + cfg = { + "previous_results": None, + "pp_modules": modules, + "output": {"output_path": output_path, "save_each_step": True}, + } + return _save("postprocess", cfg) + + +def build_index_config_wizard(documents_path: Optional[str] = None) -> str: + dense = _prompt("Dense embedding model", "sentence-transformers/all-MiniLM-L6-v2") + sparse = _prompt("Sparse embedding model", "splade") + multimodal = _confirm("Multimodal embeddings?", default=False) + db_uri = _prompt( + "DB URI (Milvus Lite file or server URL)", cwd_default("proc_demo.db") + ) + db_name = _prompt("DB name", "my_db") + collection = _prompt("Collection name", "my_docs") + docs = documents_path or _prompt( + "Documents JSONL path", + cwd_default("outputs/postprocess/results.jsonl"), + ) + cfg = { + "indexer": { + "dense_model": {"model_name": dense, "is_multimodal": multimodal}, + "sparse_model": {"model_name": sparse, "is_multimodal": multimodal}, + "db": {"uri": db_uri, "name": db_name}, + }, + "collection_name": collection, + 
"documents_path": docs, + } + return _save("index", cfg) + + +def build_full_pipeline_wizard() -> dict[str, str]: + """Build process + postprocess + index configs in one flow. + + Wires the postprocess output JSONL into the index config's documents_path + so the three files form a coherent pipeline. Validates each YAML and + re-prompts on failure (the per-stage builders run again on retry). + """ + from mmore.tui.commands import REGISTRY + from mmore.tui.pipeline import _postprocess_output_jsonl + + console.print(section("Pipeline wizard", Text("step 1/3 — process", style=ACCENT2))) + while True: + process_path = build_process_config_wizard() + err = _validate_with_spinner(process_path, REGISTRY["process"]) + if err is None: + break + _show_error_panel(process_path, err) + if not _confirm("Retry the process step?", default=True): + raise CancelledByUser("cancelled") + + console.print(section("Pipeline wizard", Text("step 2/3 — postprocess", style=ACCENT2))) + while True: + pp_path = build_postprocess_config_wizard() + err = _validate_with_spinner(pp_path, REGISTRY["postprocess"]) + if err is None: + break + _show_error_panel(pp_path, err) + if not _confirm("Retry the postprocess step?", default=True): + raise CancelledByUser("cancelled") + + try: + docs_jsonl = _postprocess_output_jsonl(pp_path) + except Exception: # noqa: BLE001 + docs_jsonl = None + + console.print(section("Pipeline wizard", Text("step 3/3 — index", style=ACCENT2))) + while True: + index_path = build_index_config_wizard(documents_path=docs_jsonl) + err = _validate_with_spinner(index_path, REGISTRY["index"]) + if err is None: + break + _show_error_panel(index_path, err) + if not _confirm("Retry the index step?", default=True): + raise CancelledByUser("cancelled") + + return {"process": process_path, "postprocess": pp_path, "index": index_path} + + def find_yaml_configs(spec: CommandSpec) -> list[str]: """Find candidate YAML configs scoped to this stage. 
@@ -223,6 +486,19 @@ def _validate_yaml(path: str, spec: CommandSpec) -> Optional[str]: return f"{type(e).__name__}: {e}" +def _validate_with_spinner(path: str, spec: CommandSpec) -> Optional[str]: + """Same as _validate_yaml but shows a spinner — config dataclass imports + can take several seconds (heavy transitive imports), making the TUI look + frozen otherwise.""" + spinner = Spinner( + "dots", text=Text(f" Validating {spec.name} config…", style="cyan") + ) + result: dict[str, Optional[str]] = {} + with Live(spinner, console=console, refresh_per_second=12, transient=True): + result["err"] = _validate_yaml(path, spec) + return result["err"] + + def _show_error_panel(path: str, err: str) -> None: console.print( Panel( @@ -281,7 +557,7 @@ def pick_or_build_config( qmark=QMARK, ).ask() if choice is None: - raise KeyboardInterrupt + raise CancelledByUser("cancelled") path: Optional[str] = None @@ -303,7 +579,7 @@ def pick_or_build_config( qmark=QMARK, ).ask() if picked is None: - raise KeyboardInterrupt + raise CancelledByUser("cancelled") path = picked if choice == "manual": @@ -332,4 +608,4 @@ def pick_or_build_config( return path _show_error_panel(path, err) if not _confirm("Try a different config?", default=True): - raise KeyboardInterrupt + raise CancelledByUser("cancelled") diff --git a/src/mmore/tui/exceptions.py b/src/mmore/tui/exceptions.py new file mode 100644 index 00000000..905d0d25 --- /dev/null +++ b/src/mmore/tui/exceptions.py @@ -0,0 +1,11 @@ +"""TUI-only exceptions.""" + +from __future__ import annotations + + +class CancelledByUser(Exception): + """Raised when the user cancels a sub-flow (Ctrl-C or Esc inside a prompt). + + Caught by the top-level menu loop so cancellation returns to the main menu + instead of exiting the whole TUI. 
+ """ diff --git a/src/mmore/tui/pipeline.py b/src/mmore/tui/pipeline.py index 1114a5ba..d460c651 100644 --- a/src/mmore/tui/pipeline.py +++ b/src/mmore/tui/pipeline.py @@ -83,6 +83,58 @@ def _summary_table(rows: list[tuple[str, str, float]]) -> Table: return table +def run_pipeline_with_configs( + process_cfg: str, pp_cfg: str, index_cfg: str +) -> None: + """Execute the three stages given already-built YAML paths.""" + console.print() + console.print( + section( + "Full pipeline", + Text("process → postprocess → index → (optional) chat", style=ACCENT), + style=ACCENT2, + ) + ) + + rows: list[tuple[str, str, float]] = [] + + step_header(1, 3, "process") + elapsed = _run_step( + "Crawling + extracting documents", + REGISTRY["process"].run, + config_file=process_cfg, + ) + process_jsonl = _process_output_jsonl(process_cfg) + rows.append(("process", process_jsonl, elapsed)) + + step_header(2, 3, "postprocess") + elapsed = _run_step( + "Chunking + cleaning", + REGISTRY["postprocess"].run, + config_file=pp_cfg, + input_data=process_jsonl, + ) + pp_jsonl = _postprocess_output_jsonl(pp_cfg) + rows.append(("postprocess", pp_jsonl, elapsed)) + + step_header(3, 3, "index") + elapsed = _run_step( + "Embedding + indexing into Milvus", + REGISTRY["index"].run, + config_file=index_cfg, + documents_path=pp_jsonl, + ) + rows.append(("index", "(vector DB)", elapsed)) + + console.print() + console.print(_summary_table(rows)) + console.print() + + if questionary.confirm("Open the RAG chat now?", default=True).ask(): + rag_cfg = pick_or_build_config(REGISTRY["ragcli"]) + REGISTRY["ragcli"].run(config_file=rag_cfg) + + def run_full_pipeline() -> None: console.print() console.print( @@ -95,7 +147,6 @@ def run_full_pipeline() -> None: rows: list[tuple[str, str, float]] = [] - # process step_header(1, 3, "process") process_cfg = pick_or_build_config(REGISTRY["process"]) elapsed = _run_step( @@ -106,7 +157,6 @@ def run_full_pipeline() -> None: process_jsonl = 
_process_output_jsonl(process_cfg) rows.append(("process", process_jsonl, elapsed)) - # postprocess step_header(2, 3, "postprocess") pp_cfg = pick_or_build_config(REGISTRY["postprocess"]) elapsed = _run_step( @@ -118,7 +168,6 @@ def run_full_pipeline() -> None: pp_jsonl = _postprocess_output_jsonl(pp_cfg) rows.append(("postprocess", pp_jsonl, elapsed)) - # index step_header(3, 3, "index") index_cfg = pick_or_build_config(REGISTRY["index"], documents_path=pp_jsonl) elapsed = _run_step( diff --git a/uv.lock b/uv.lock index 85aeac3d..933ebc23 100644 --- a/uv.lock +++ b/uv.lock @@ -3634,8 +3634,6 @@ dependencies = [ { name = "pydantic" }, { name = "python-dotenv" }, { name = "pyyaml" }, - { name = "questionary" }, - { name = "rich" }, { name = "setuptools" }, { name = "typing-extensions" }, { name = "validators" }, @@ -3690,9 +3688,11 @@ all = [ { name = "pymupdf" }, { name = "python-docx" }, { name = "python-pptx" }, + { name = "questionary" }, { name = "ragas" }, { name = "rarfile" }, { name = "requests" }, + { name = "rich" }, { name = "scipy", version = "1.15.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11' or (extra == 'extra-5-mmore-cpu' and extra == 'extra-5-mmore-cu126')" }, { name = "scipy", version = "1.17.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' or (extra == 'extra-5-mmore-cpu' and extra == 'extra-5-mmore-cu126')" }, { name = "sentence-transformers" }, @@ -3800,6 +3800,10 @@ rag = [ { name = "sentence-transformers" }, { name = "transformers" }, ] +tui = [ + { name = "questionary" }, + { name = "rich" }, +] websearch = [ { name = "ddgs" }, { name = "tavily-python" }, @@ -3846,7 +3850,7 @@ requires-dist = [ { name = "marker-pdf", marker = "extra == 'process'", specifier = ">=1.6" }, { name = "milvus-model", marker = "extra == 'index'", specifier = ">=0.2.12" }, { name = "mmore", extras = ["index"], marker = "extra == 'rag'" }, - { name = "mmore", extras = 
["process", "rag", "api", "websearch"], marker = "extra == 'all'" }, + { name = "mmore", extras = ["process", "rag", "api", "websearch", "tui"], marker = "extra == 'all'" }, { name = "motor", marker = "extra == 'api'", specifier = ">=3.5" }, { name = "moviepy", marker = "extra == 'process'", specifier = ">=2.0" }, { name = "nltk", marker = "extra == 'rag'", specifier = ">=3.9" }, @@ -3868,12 +3872,12 @@ requires-dist = [ { name = "python-dotenv", specifier = ">=1.0" }, { name = "python-pptx", marker = "extra == 'process'" }, { name = "pyyaml", specifier = ">=6.0" }, - { name = "questionary", specifier = ">=2.0" }, + { name = "questionary", marker = "extra == 'tui'", specifier = ">=2.0" }, { name = "ragas", marker = "extra == 'rag'", specifier = ">=0.2" }, { name = "rarfile", marker = "extra == 'process'", specifier = ">=4.1" }, { name = "requests", marker = "extra == 'api'", specifier = ">=2.31" }, { name = "requests", marker = "extra == 'process'", specifier = ">=2.31" }, - { name = "rich", specifier = ">=13" }, + { name = "rich", marker = "extra == 'tui'", specifier = ">=13" }, { name = "ruff", marker = "extra == 'dev'", specifier = ">=0.4.0" }, { name = "scipy", marker = "extra == 'index'", specifier = ">=1.8" }, { name = "sentence-transformers", marker = "extra == 'index'" }, @@ -3898,7 +3902,7 @@ requires-dist = [ { name = "validators", specifier = ">=0.28" }, { name = "xlrd", marker = "extra == 'process'", specifier = ">=2.0.1" }, ] -provides-extras = ["process", "index", "rag", "api", "all", "cpu", "cu126", "websearch", "dev"] +provides-extras = ["process", "index", "rag", "api", "tui", "all", "cpu", "cu126", "websearch", "dev"] [[package]] name = "motor" From 628ca181086909f4677cb075a5c0b02e61ba510c Mon Sep 17 00:00:00 2001 From: Arthur PERRIN Date: Tue, 12 May 2026 08:54:31 +0200 Subject: [PATCH 04/24] =?UTF-8?q?fix(tui):=20rename=20CancelledByUser?= =?UTF-8?q?=E2=86=92UserCancelledError,=20apply=20ruff=20format?= MIME-Version: 1.0 Content-Type: 
text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/mmore/tui/app.py | 18 +++++++-------- src/mmore/tui/commands.py | 5 +--- src/mmore/tui/config_builder.py | 41 +++++++++++++++++---------------- src/mmore/tui/exceptions.py | 2 +- src/mmore/tui/pipeline.py | 4 +--- 5 files changed, 32 insertions(+), 38 deletions(-) diff --git a/src/mmore/tui/app.py b/src/mmore/tui/app.py index 9f414469..47d3af96 100644 --- a/src/mmore/tui/app.py +++ b/src/mmore/tui/app.py @@ -6,17 +6,16 @@ import questionary from rich.live import Live +from rich.panel import Panel from rich.spinner import Spinner from rich.text import Text -from rich.panel import Panel - from mmore.tui.commands import REGISTRY, check_stage_available from mmore.tui.config_builder import ( build_full_pipeline_wizard, pick_or_build_config, ) -from mmore.tui.exceptions import CancelledByUser +from mmore.tui.exceptions import UserCancelledError from mmore.tui.paths import cwd_default from mmore.tui.pipeline import run_full_pipeline, run_pipeline_with_configs from mmore.tui.theme import ( @@ -61,9 +60,7 @@ def _run_single_command() -> None: label = f"{spec.name:<12} — {spec.description}" if hint: label += " [dim](extras missing)[/dim]" - choices.append( - questionary.Choice(label, value=spec.name, disabled=hint) - ) + choices.append(questionary.Choice(label, value=spec.name, disabled=hint)) else: choices.append(questionary.Choice(label, value=spec.name)) name = questionary.select( @@ -137,14 +134,15 @@ def _run_full_wizard() -> None: style=QSTYLE, qmark=QMARK, ).ask(): - run_pipeline_with_configs(paths["process"], paths["postprocess"], paths["index"]) + run_pipeline_with_configs( + paths["process"], paths["postprocess"], paths["index"] + ) def _pipeline_hint() -> str | None: """Return a combined hint if any of process/postprocess/index is missing.""" hints = [ - check_stage_available(REGISTRY[s]) - for s in ("process", "postprocess", "index") + check_stage_available(REGISTRY[s]) for s in ("process", 
"postprocess", "index") ] hints = [h for h in hints if h] return " | ".join(hints) if hints else None @@ -210,7 +208,7 @@ def run() -> None: _run_full_wizard() elif mode == "chat": _chat_only() - except (CancelledByUser, KeyboardInterrupt): + except (UserCancelledError, KeyboardInterrupt): console.print(f"[{ACCENT2}]cancelled — back to menu.[/]") continue except Exception as e: # noqa: BLE001 diff --git a/src/mmore/tui/commands.py b/src/mmore/tui/commands.py index b498e351..650cbf6d 100644 --- a/src/mmore/tui/commands.py +++ b/src/mmore/tui/commands.py @@ -41,10 +41,7 @@ def check_stage_available(spec: "CommandSpec") -> Optional[str]: if not missing: return None extras = " ".join(f"--extra {e}" for e in spec.required_extras) - return ( - f"Missing: {', '.join(missing)}. " - f"Install with: uv sync {extras}".strip() - ) + return f"Missing: {', '.join(missing)}. Install with: uv sync {extras}".strip() def _process(config_file: str, **_): diff --git a/src/mmore/tui/config_builder.py b/src/mmore/tui/config_builder.py index e9b257d6..09e57e58 100644 --- a/src/mmore/tui/config_builder.py +++ b/src/mmore/tui/config_builder.py @@ -21,13 +21,13 @@ from rich.text import Text from mmore.tui.commands import CommandSpec -from mmore.tui.exceptions import CancelledByUser +from mmore.tui.exceptions import UserCancelledError from mmore.tui.paths import cwd_default, repo_root, resolve_example from mmore.tui.theme import ACCENT2, QMARK, QSTYLE, console, section def _ask(prompt_obj: Any) -> Any: - """Call .ask() and translate Ctrl-C / Esc into CancelledByUser. + """Call .ask() and translate Ctrl-C / Esc into UserCancelledError. questionary raises KeyboardInterrupt on Ctrl-C and returns None on Esc. Both should land us back at the main menu, not exit the TUI. 
@@ -35,11 +35,12 @@ def _ask(prompt_obj: Any) -> Any: try: answer = prompt_obj.ask() except KeyboardInterrupt as e: - raise CancelledByUser("cancelled") from e + raise UserCancelledError("cancelled") from e if answer is None: - raise CancelledByUser("cancelled") + raise UserCancelledError("cancelled") return answer + CONFIG_DIR = Path("./tui-configs") @@ -130,7 +131,7 @@ def build_postprocess_config() -> str: qmark=QMARK, ).ask() if strategy is None: - raise CancelledByUser("cancelled") + raise UserCancelledError("cancelled") table_handling = questionary.select( "Table handling", choices=["single_row", "multi_rows", "keep_whole", "none"], @@ -139,7 +140,7 @@ def build_postprocess_config() -> str: qmark=QMARK, ).ask() if table_handling is None: - raise CancelledByUser("cancelled") + raise UserCancelledError("cancelled") output_path = _prompt( "Output JSONL path", cwd_default("outputs/postprocess/results.jsonl"), @@ -251,7 +252,7 @@ def build_process_config_wizard() -> str: qmark=QMARK, ).ask() if selected is None: - raise CancelledByUser("cancelled") + raise UserCancelledError("cancelled") if not selected: selected = names # empty would mean a no-op pipeline; fall back to all @@ -271,9 +272,7 @@ def build_process_config_wizard() -> str: sizes.append({name: value}) processor_config = { - name: cfg - for name, cfg in _PROCESSOR_DEFAULT_CONFIG.items() - if name in selected + name: cfg for name, cfg in _PROCESSOR_DEFAULT_CONFIG.items() if name in selected } cfg = { @@ -311,7 +310,7 @@ def _ask_module_args(pp_type: str) -> dict[str, Any]: qmark=QMARK, ).ask() if strategy is None: - raise CancelledByUser("cancelled") + raise UserCancelledError("cancelled") table_handling = questionary.select( "Table handling", choices=["single_row", "multi_rows", "keep_whole", "none"], @@ -320,7 +319,7 @@ def _ask_module_args(pp_type: str) -> dict[str, Any]: qmark=QMARK, ).ask() if table_handling is None: - raise CancelledByUser("cancelled") + raise UserCancelledError("cancelled") return 
{ "chunking_strategy": strategy, "table_handling": table_handling, @@ -354,7 +353,7 @@ def build_postprocess_config_wizard() -> str: qmark=QMARK, ).ask() if pp_type is None: - raise CancelledByUser("cancelled") + raise UserCancelledError("cancelled") if pp_type == "(done)": break args = _ask_module_args(pp_type) @@ -415,9 +414,11 @@ def build_full_pipeline_wizard() -> dict[str, str]: break _show_error_panel(process_path, err) if not _confirm("Retry the process step?", default=True): - raise CancelledByUser("cancelled") + raise UserCancelledError("cancelled") - console.print(section("Pipeline wizard", Text("step 2/3 — postprocess", style=ACCENT2))) + console.print( + section("Pipeline wizard", Text("step 2/3 — postprocess", style=ACCENT2)) + ) while True: pp_path = build_postprocess_config_wizard() err = _validate_with_spinner(pp_path, REGISTRY["postprocess"]) @@ -425,7 +426,7 @@ def build_full_pipeline_wizard() -> dict[str, str]: break _show_error_panel(pp_path, err) if not _confirm("Retry the postprocess step?", default=True): - raise CancelledByUser("cancelled") + raise UserCancelledError("cancelled") try: docs_jsonl = _postprocess_output_jsonl(pp_path) @@ -440,7 +441,7 @@ def build_full_pipeline_wizard() -> dict[str, str]: break _show_error_panel(index_path, err) if not _confirm("Retry the index step?", default=True): - raise CancelledByUser("cancelled") + raise UserCancelledError("cancelled") return {"process": process_path, "postprocess": pp_path, "index": index_path} @@ -557,7 +558,7 @@ def pick_or_build_config( qmark=QMARK, ).ask() if choice is None: - raise CancelledByUser("cancelled") + raise UserCancelledError("cancelled") path: Optional[str] = None @@ -579,7 +580,7 @@ def pick_or_build_config( qmark=QMARK, ).ask() if picked is None: - raise CancelledByUser("cancelled") + raise UserCancelledError("cancelled") path = picked if choice == "manual": @@ -608,4 +609,4 @@ def pick_or_build_config( return path _show_error_panel(path, err) if not _confirm("Try a 
different config?", default=True): - raise CancelledByUser("cancelled") + raise UserCancelledError("cancelled") diff --git a/src/mmore/tui/exceptions.py b/src/mmore/tui/exceptions.py index 905d0d25..eb310dae 100644 --- a/src/mmore/tui/exceptions.py +++ b/src/mmore/tui/exceptions.py @@ -3,7 +3,7 @@ from __future__ import annotations -class CancelledByUser(Exception): +class UserCancelledError(Exception): """Raised when the user cancels a sub-flow (Ctrl-C or Esc inside a prompt). Caught by the top-level menu loop so cancellation returns to the main menu diff --git a/src/mmore/tui/pipeline.py b/src/mmore/tui/pipeline.py index d460c651..ded98048 100644 --- a/src/mmore/tui/pipeline.py +++ b/src/mmore/tui/pipeline.py @@ -83,9 +83,7 @@ def _summary_table(rows: list[tuple[str, str, float]]) -> Table: return table -def run_pipeline_with_configs( - process_cfg: str, pp_cfg: str, index_cfg: str -) -> None: +def run_pipeline_with_configs(process_cfg: str, pp_cfg: str, index_cfg: str) -> None: """Execute the three stages given already-built YAML paths.""" console.print() console.print( From 9d996fa8068f9beb28ec2eab3cb757413dcc96a2 Mon Sep 17 00:00:00 2001 From: Arthur PERRIN Date: Tue, 12 May 2026 12:20:13 +0200 Subject: [PATCH 05/24] fix: adding wizard config for single command --- src/mmore/tui/config_builder.py | 147 ++++++++++++++++++++++++++++++++ 1 file changed, 147 insertions(+) diff --git a/src/mmore/tui/config_builder.py b/src/mmore/tui/config_builder.py index 09e57e58..69aacba2 100644 --- a/src/mmore/tui/config_builder.py +++ b/src/mmore/tui/config_builder.py @@ -186,10 +186,157 @@ def build_index_config(documents_path: Optional[str] = None) -> str: return _save("index", cfg) +def build_rag_config() -> str: + """Wizard for `rag` / `retrieve` / `ragcli` configs.""" + llm_name = _prompt("LLM name", "OpenMeditron/meditron3-8b") + max_new_tokens_raw = _prompt("Max new tokens", "1200") + try: + max_new_tokens = int(max_new_tokens_raw) + except ValueError: + max_new_tokens 
= 1200 + + db_uri = _prompt( + "DB URI (Milvus Lite file or server URL)", cwd_default("proc_demo.db") + ) + db_name = _prompt("DB name", "my_db") + collection = _prompt("Collection name", "my_docs") + k_raw = _prompt("Number of docs to retrieve (k)", "5") + try: + k = int(k_raw) + except ValueError: + k = 5 + hybrid_raw = _prompt("Hybrid search weight (0.0 dense — 1.0 sparse)", "0.5") + try: + hybrid = float(hybrid_raw) + except ValueError: + hybrid = 0.5 + use_web = _confirm("Augment retrieval with web search?", default=False) + reranker = _prompt("Reranker model (blank to skip)", "BAAI/bge-reranker-base") + + mode = questionary.select( + "Run mode", + choices=["local", "api"], + default="local", + style=QSTYLE, + qmark=QMARK, + ).ask() + if mode is None: + raise UserCancelledError("cancelled") + + cfg: dict[str, Any] = { + "rag": { + "llm": {"llm_name": llm_name, "max_new_tokens": max_new_tokens}, + "retriever": { + "db": {"uri": db_uri, "name": db_name}, + "hybrid_search_weight": hybrid, + "k": k, + "collection_name": collection, + "use_web": use_web, + "reranker_model_name": reranker or None, + }, + "system_prompt": ( + "Use the following context to answer the questions.\n\n" + "Context:\n{context}" + ), + }, + "mode": mode, + } + if mode == "local": + input_file = _prompt( + "Queries JSONL path", cwd_default("examples/rag/queries.jsonl") + ) + output_file = _prompt( + "Output JSON path", cwd_default("outputs/rag/output.json") + ) + cfg["mode_args"] = {"input_file": input_file, "output_file": output_file} + else: + port_raw = _prompt("API port", "8000") + try: + port = int(port_raw) + except ValueError: + port = 8000 + cfg["mode_args"] = { + "endpoint": "/rag", + "host": "0.0.0.0", + "port": port, + } + return _save("rag", cfg) + + +def build_websearch_config() -> str: + """Wizard for `websearch` configs.""" + use_rag = _confirm("Combine web search with RAG?", default=True) + rag_path = "" + if use_rag: + rag_path = _prompt( + "Path to a RAG config YAML", + 
cwd_default("examples/rag/config.yaml"), + ) + llm_name = _prompt("LLM name", "OpenMeditron/meditron3-8b") + max_new_tokens_raw = _prompt("Max new tokens", "1200") + try: + max_new_tokens = int(max_new_tokens_raw) + except ValueError: + max_new_tokens = 1200 + input_queries = _prompt( + "Input queries JSONL", cwd_default("examples/rag/queries.jsonl") + ) + output_file = _prompt( + "Output JSON path", + cwd_default("outputs/websearch/enhanced_results.json"), + ) + n_subqueries_raw = _prompt("Number of sub-queries per question", "2") + try: + n_subqueries = int(n_subqueries_raw) + except ValueError: + n_subqueries = 2 + max_searches_raw = _prompt("Max searches per query", "5") + try: + max_searches = int(max_searches_raw) + except ValueError: + max_searches = 5 + provider = questionary.select( + "Search provider", + choices=["duckduckgo"], + default="duckduckgo", + style=QSTYLE, + qmark=QMARK, + ).ask() + if provider is None: + raise UserCancelledError("cancelled") + + cfg: dict[str, Any] = { + "websearch": { + "use_rag": use_rag, + "rag_config_path": rag_path, + "use_summary": True, + "n_subqueries": n_subqueries, + "input_queries": input_queries, + "output_file": output_file, + "n_loops": 2, + "max_searches": max_searches, + "search_provider": provider, + "max_retries": 3, + "max_context_tokens": 2048, + "fast_tokenizer": False, + "mode": "local", + "llm_config": { + "llm_name": llm_name, + "max_new_tokens": max_new_tokens, + }, + } + } + return _save("websearch", cfg) + + BUILDERS = { "process": build_process_config, "postprocess": build_postprocess_config, "index": build_index_config, + "rag": build_rag_config, + "retrieve": build_rag_config, + "ragcli": build_rag_config, + "websearch": build_websearch_config, } From c303eee2870365cc111862da40418a46eb46583e Mon Sep 17 00:00:00 2001 From: Arthur PERRIN Date: Tue, 12 May 2026 12:22:46 +0200 Subject: [PATCH 06/24] docs: for_devs updated --- .../developer_documentation/for_devs.md | 20 +++++++++++++++++++ 1 file 
changed, 20 insertions(+) diff --git a/docs/source/developer_documentation/for_devs.md b/docs/source/developer_documentation/for_devs.md index ecd179c4..5d9949ff 100644 --- a/docs/source/developer_documentation/for_devs.md +++ b/docs/source/developer_documentation/for_devs.md @@ -31,6 +31,7 @@ This guide will help you set up your development environment and contribute to t - [Writing tests](#writing-tests) - [🔀 Pull Request Process](#-pull-request-process) - [PR checklist](#pr-checklist) + - [🖥️ Interactive TUI](#️-interactive-tui) - [💡 Development tips](#-development-tips) - [Working with `uv`](#working-with-uv) - [❓ Questions](#-questions) @@ -256,6 +257,25 @@ def test_something_on_gpu(): - [ ] Examples are provided for new features - [ ] Commit messages are clear and descriptive +## 🖥️ Interactive TUI + +MMORE ships with a Terminal UI that wraps the CLI commands behind guided menus and config wizards. Useful for trying the pipeline without writing YAML by hand. + +Launch it from a project working directory: + +```bash +mmore tui +``` + +From the main menu you can: + +- **Run a single command** — pick any stage (`process`, `postprocess`, `index`, `retrieve`, `rag`, `ragcli`, `websearch`), then either select an existing YAML, generate one through a guided wizard, or type a path manually. Generated configs are written to `./tui-configs/` and validated against the stage's dataclass before running. +- **Run full pipeline** — chains `process → postprocess → index` using existing configs. +- **Build a full pipeline config (guided wizard)** — walks through the three stages in order, wiring the postprocess output JSONL into the index config automatically. +- **Chat with indexed documents** — shortcut to `ragcli`. + +Stages whose extras are missing are disabled in the menu with an install hint (e.g. `uv sync --extra rag --extra cpu`). Press `Ctrl-C` inside any sub-flow to cancel back to the main menu; press it again at the main menu to quit. 
+ ## 💡 Development tips ### Working with `uv` From 61b773e27863537ba474df330e49150b7b25a720 Mon Sep 17 00:00:00 2001 From: Arthur PERRIN Date: Tue, 12 May 2026 13:18:40 +0200 Subject: [PATCH 07/24] feat(tui): config preview, JSONL inspector, incremental resume, $EDITOR edit - Add post-validation menu (preview / edit / run) when picking a config: syntax-highlighted YAML preview via rich.Syntax, and $EDITOR launch with automatic re-validation on save. - New inspector module (src/mmore/tui/inspector.py) that streams JSONL output files and prints a summary table (doc count, processor types, file types, avg text length, modalities) plus a sample of the first 3 documents. Called automatically after process and postprocess steps. - Wizard builders now detect existing output files and propose resuming via previous_results instead of always writing null, leveraging the existing incremental.py module for skipping unchanged files. - Add "Edit an existing YAML in $EDITOR" choice to pick_or_build_config. 
--- src/mmore/tui/config_builder.py | 132 ++++++++++++++++++++++++++------ src/mmore/tui/inspector.py | 126 ++++++++++++++++++++++++++++++ src/mmore/tui/pipeline.py | 5 ++ 3 files changed, 238 insertions(+), 25 deletions(-) create mode 100644 src/mmore/tui/inspector.py diff --git a/src/mmore/tui/config_builder.py b/src/mmore/tui/config_builder.py index 69aacba2..8fa6e875 100644 --- a/src/mmore/tui/config_builder.py +++ b/src/mmore/tui/config_builder.py @@ -9,6 +9,7 @@ from __future__ import annotations import os +import subprocess import time from pathlib import Path from typing import Any, Optional @@ -18,12 +19,13 @@ from rich.live import Live from rich.panel import Panel from rich.spinner import Spinner +from rich.syntax import Syntax from rich.text import Text from mmore.tui.commands import CommandSpec from mmore.tui.exceptions import UserCancelledError from mmore.tui.paths import cwd_default, repo_root, resolve_example -from mmore.tui.theme import ACCENT2, QMARK, QSTYLE, console, section +from mmore.tui.theme import ACCENT, ACCENT2, QMARK, QSTYLE, console, section def _ask(prompt_obj: Any) -> Any: @@ -62,6 +64,58 @@ def _save(name: str, data: dict[str, Any]) -> str: return str(path) +def _preview_config(path: str) -> None: + """Display a YAML file with syntax highlighting.""" + content = Path(path).read_text() + console.print( + Panel( + Syntax(content, "yaml", theme="monokai", line_numbers=True), + title=f"[bold]{path}[/bold]", + border_style=ACCENT, + padding=(1, 2), + ) + ) + + +def _edit_config(path: str) -> None: + """Open a config file in $EDITOR (falls back to vi).""" + editor = os.environ.get("EDITOR", "vi") + subprocess.call([editor, path]) + + +def _post_validation_menu(path: str, spec: CommandSpec) -> str: + """After validation, let the user preview, edit, or run the config. + + Returns the (potentially re-validated) path. 
+ """ + while True: + action = _ask( + questionary.select( + "What next?", + choices=[ + questionary.Choice("▶ Run with this config", value="run"), + questionary.Choice("👁 Preview config", value="preview"), + questionary.Choice("✎ Edit in $EDITOR", value="edit"), + ], + default="▶ Run with this config", + style=QSTYLE, + qmark=QMARK, + ) + ) + if action == "run": + return path + if action == "preview": + _preview_config(path) + continue + if action == "edit": + _edit_config(path) + err = _validate_with_spinner(path, spec) + if err: + _show_error_panel(path, err) + continue + return path # unreachable but keeps mypy happy + + def build_process_config() -> str: data_path = _prompt( "Data path (folder with documents to process)", @@ -422,10 +476,19 @@ def build_process_config_wizard() -> str: name: cfg for name, cfg in _PROCESSOR_DEFAULT_CONFIG.items() if name in selected } + # Incremental resume: detect previous results + previous_results = None + prev_path = os.path.join(output_path, "merged", "merged_results.jsonl") + if os.path.exists(prev_path) and _confirm( + f"Previous results found at {prev_path}. Resume (skip unchanged files)?", + default=True, + ): + previous_results = prev_path + cfg = { "data_path": data_path, "google_drive_ids": [], - "previous_results": None, + "previous_results": previous_results, "dispatcher_config": { "output_path": output_path, "use_fast_processors": use_fast, @@ -510,8 +573,22 @@ def build_postprocess_config_wizard() -> str: "Output JSONL path", cwd_default("outputs/postprocess/results.jsonl"), ) + + # Incremental resume: detect previous results + previous_results = None + # Resolve the actual JSONL path (dir → dir/final.jsonl, .jsonl → as-is) + if output_path.endswith(".jsonl"): + pp_prev_path = output_path + else: + pp_prev_path = os.path.join(output_path, "final.jsonl") + if os.path.exists(pp_prev_path) and _confirm( + f"Previous results found at {pp_prev_path}. 
Resume (skip unchanged)?", + default=True, + ): + previous_results = pp_prev_path + cfg = { - "previous_results": None, + "previous_results": previous_results, "pp_modules": modules, "output": {"output_path": output_path, "save_each_step": True}, } @@ -694,22 +771,25 @@ def pick_or_build_config( on failure rather than letting the run blow up later. """ while True: - choice = questionary.select( - f"Config for `{spec.name}`?", - choices=[ - questionary.Choice("📂 Pick existing YAML", value="pick"), - questionary.Choice("✨ Generate new YAML (guided)", value="build"), - questionary.Choice("⌨ Type a path manually", value="manual"), - ], - style=QSTYLE, - qmark=QMARK, - ).ask() - if choice is None: - raise UserCancelledError("cancelled") + choice = _ask( + questionary.select( + f"Config for `{spec.name}`?", + choices=[ + questionary.Choice("📂 Pick existing YAML", value="pick"), + questionary.Choice("✨ Generate new YAML (guided)", value="build"), + questionary.Choice( + "✎ Edit an existing YAML in $EDITOR", value="edit" + ), + questionary.Choice("⌨ Type a path manually", value="manual"), + ], + style=QSTYLE, + qmark=QMARK, + ) + ) path: Optional[str] = None - if choice == "pick": + if choice in ("pick", "edit"): candidates = find_yaml_configs(spec) ranked = _ranked_choices(spec, candidates) if not ranked: @@ -720,15 +800,17 @@ def pick_or_build_config( ) choice = "manual" else: - picked = questionary.select( - f"Select a config for `{spec.name}`", - choices=ranked, - style=QSTYLE, - qmark=QMARK, - ).ask() - if picked is None: - raise UserCancelledError("cancelled") + picked = _ask( + questionary.select( + f"Select a config for `{spec.name}`", + choices=ranked, + style=QSTYLE, + qmark=QMARK, + ) + ) path = picked + if choice == "edit": + _edit_config(path) if choice == "manual": manual = _prompt("Path to YAML config") @@ -753,7 +835,7 @@ def pick_or_build_config( assert path is not None err = _validate_yaml(path, spec) if err is None: - return path + return 
_post_validation_menu(path, spec) _show_error_panel(path, err) if not _confirm("Try a different config?", default=True): raise UserCancelledError("cancelled") diff --git a/src/mmore/tui/inspector.py b/src/mmore/tui/inspector.py new file mode 100644 index 00000000..2d0dd033 --- /dev/null +++ b/src/mmore/tui/inspector.py @@ -0,0 +1,126 @@ +"""Lightweight JSONL inspector for TUI result previews. + +Streams the file line-by-line (no heavy imports like torch/transformers) +and prints a rich summary table + sample documents. +""" + +from __future__ import annotations + +import json +import os +from collections import Counter +from pathlib import Path +from typing import Any + +from rich.panel import Panel +from rich.table import Table +from rich.text import Text + +from mmore.tui.theme import ACCENT, ACCENT2, MUTED, console + + +def _iter_dicts(path: str): + """Yield raw dicts from a JSONL file without importing MultimodalSample.""" + with open(path) as f: + for line in f: + line = line.strip() + if line: + yield json.loads(line) + + +def inspect_jsonl(path: str, max_samples: int = 3) -> None: + """Print a summary of a JSONL file: counts, breakdowns, sample docs.""" + if not os.path.exists(path): + console.print(f" [dim]no output file at {path}[/dim]") + return + + total = 0 + processor_types: Counter[str] = Counter() + file_extensions: Counter[str] = Counter() + modality_types: Counter[str] = Counter() + total_text_len = 0 + samples: list[dict[str, Any]] = [] + + for doc in _iter_dicts(path): + total += 1 + + meta = doc.get("metadata", {}) + pt = meta.get("processor_type", "unknown") + processor_types[pt] += 1 + + fp = meta.get("file_path", "") + ext = Path(fp).suffix.lower() if fp else "(none)" + file_extensions[ext] += 1 + + text = doc.get("text", "") + if isinstance(text, str): + total_text_len += len(text) + + for mod in doc.get("modalities", []): + modality_types[mod.get("type", "unknown")] += 1 + + if len(samples) < max_samples: + samples.append(doc) + + if total 
== 0: + console.print(" [dim]empty JSONL (0 documents)[/dim]") + return + + # --- Stats table --- + table = Table( + title="[bold]Results summary[/bold]", + title_style=ACCENT2, + border_style=ACCENT, + header_style=f"bold {ACCENT}", + show_lines=False, + padding=(0, 2), + ) + table.add_column("Metric", style="bold") + table.add_column("Value") + + table.add_row("Total documents", str(total)) + table.add_row("Avg text length", f"{total_text_len // total:,} chars") + + if processor_types: + breakdown = ", ".join(f"{k}: {v}" for k, v in processor_types.most_common()) + table.add_row("Processor types", breakdown) + + if file_extensions: + breakdown = ", ".join(f"{k}: {v}" for k, v in file_extensions.most_common()) + table.add_row("File types", breakdown) + + if modality_types: + breakdown = ", ".join(f"{k}: {v}" for k, v in modality_types.most_common()) + table.add_row("Modalities", breakdown) + + console.print() + console.print(table) + + # --- Sample documents --- + if samples: + sample_text = Text() + for i, doc in enumerate(samples, 1): + meta = doc.get("metadata", {}) + fp = meta.get("file_path", "?") + pt = meta.get("processor_type", "?") + text = doc.get("text", "") + if isinstance(text, str): + preview = text[:200].replace("\n", " ") + if len(text) > 200: + preview += "…" + else: + preview = "(structured content)" + sample_text.append(f"#{i} ", style="bold") + sample_text.append(f"{fp} ") + sample_text.append(f"({pt})", style="dim") + sample_text.append("\n") + sample_text.append(preview + "\n\n", style=MUTED) + + console.print( + Panel( + sample_text, + title=f"[bold]Sample documents (first {len(samples)})[/bold]", + border_style=ACCENT, + padding=(1, 2), + ) + ) diff --git a/src/mmore/tui/pipeline.py b/src/mmore/tui/pipeline.py index ded98048..d79f83ce 100644 --- a/src/mmore/tui/pipeline.py +++ b/src/mmore/tui/pipeline.py @@ -13,6 +13,7 @@ from mmore.tui.commands import REGISTRY from mmore.tui.config_builder import pick_or_build_config +from 
mmore.tui.inspector import inspect_jsonl from mmore.tui.theme import ( ACCENT, ACCENT2, @@ -104,6 +105,7 @@ def run_pipeline_with_configs(process_cfg: str, pp_cfg: str, index_cfg: str) -> ) process_jsonl = _process_output_jsonl(process_cfg) rows.append(("process", process_jsonl, elapsed)) + inspect_jsonl(process_jsonl) step_header(2, 3, "postprocess") elapsed = _run_step( @@ -114,6 +116,7 @@ def run_pipeline_with_configs(process_cfg: str, pp_cfg: str, index_cfg: str) -> ) pp_jsonl = _postprocess_output_jsonl(pp_cfg) rows.append(("postprocess", pp_jsonl, elapsed)) + inspect_jsonl(pp_jsonl) step_header(3, 3, "index") elapsed = _run_step( @@ -154,6 +157,7 @@ def run_full_pipeline() -> None: ) process_jsonl = _process_output_jsonl(process_cfg) rows.append(("process", process_jsonl, elapsed)) + inspect_jsonl(process_jsonl) step_header(2, 3, "postprocess") pp_cfg = pick_or_build_config(REGISTRY["postprocess"]) @@ -165,6 +169,7 @@ def run_full_pipeline() -> None: ) pp_jsonl = _postprocess_output_jsonl(pp_cfg) rows.append(("postprocess", pp_jsonl, elapsed)) + inspect_jsonl(pp_jsonl) step_header(3, 3, "index") index_cfg = pick_or_build_config(REGISTRY["index"], documents_path=pp_jsonl) From f521702ceccf794901f4b924bfbf3f52d4a6b957 Mon Sep 17 00:00:00 2001 From: Mathieu Date: Tue, 12 May 2026 16:38:36 +0200 Subject: [PATCH 08/24] disable commands when pipeline extras are missing | big fix --- src/mmore/tui/app.py | 79 ++++++++++++++++++++++++++++----- src/mmore/tui/config_builder.py | 18 +++++--- src/mmore/tui/theme.py | 1 + 3 files changed, 83 insertions(+), 15 deletions(-) diff --git a/src/mmore/tui/app.py b/src/mmore/tui/app.py index 47d3af96..4a5b33e0 100644 --- a/src/mmore/tui/app.py +++ b/src/mmore/tui/app.py @@ -45,6 +45,39 @@ def _show_missing_extras(spec_name: str, hint: str) -> None: ) +def _missing_extras_notice() -> Panel | None: + """One-line-per-install-command notice — kept compact so the banner stays visible.""" + install_to_stages: dict[str, list[str]] = {} 
+ for name, spec in REGISTRY.items(): + hint = check_stage_available(spec) + if hint and "Install with: " in hint: + cmd = hint.split("Install with: ", 1)[1].strip() + install_to_stages.setdefault(cmd, []).append(name) + + if not install_to_stages: + return None + + body = Text() + for i, (cmd, stages) in enumerate(install_to_stages.items()): + if i > 0: + body.append("\n") + body.append(", ".join(stages), style="bold white") + body.append(" → ", style="yellow") + body.append(cmd, style="cyan") + + return Panel( + body, + title="[bold yellow]⚠ missing extras[/]", + border_style="yellow", + padding=(0, 1), + ) + + +def _disabled_label(label: str) -> str: + """Prefix a menu label so its disabled state is immediately readable.""" + return f"⚠ {label}" + + def _run_with_spinner(label: str, fn, **kwargs) -> None: start = time.time() spinner = Spinner("dots", text=Text(f" {label}…", style=ACCENT)) @@ -55,14 +88,29 @@ def _run_with_spinner(label: str, fn, **kwargs) -> None: def _run_single_command() -> None: choices = [] + enabled_count = 0 for spec in REGISTRY.values(): hint = check_stage_available(spec) label = f"{spec.name:<12} — {spec.description}" if hint: - label += " [dim](extras missing)[/dim]" - choices.append(questionary.Choice(label, value=spec.name, disabled=hint)) + choices.append( + questionary.Choice( + _disabled_label(label), value=spec.name, disabled="missing extras" + ) + ) else: choices.append(questionary.Choice(label, value=spec.name)) + enabled_count += 1 + + # questionary crashes ("InquirerControl has no attribute 'pointed_at'") when + # every choice is disabled because it can't pick an initial pointer. Bail + # out with a clear notice instead. 
+ if enabled_count == 0: + notice = _missing_extras_notice() + if notice is not None: + console.print(notice) + return + name = questionary.select( "Pick a command", choices=choices, @@ -149,24 +197,35 @@ def _pipeline_hint() -> str | None: def _main_menu() -> str | None: + notice = _missing_extras_notice() + if notice is not None: + console.print(notice) + pipeline_hint = _pipeline_hint() chat_hint = check_stage_available(REGISTRY["ragcli"]) + # The wizard validates each generated YAML against the stage's dataclass, + # which transitively imports torch / transformers / etc. — so it needs the + # same extras as the full pipeline. Reuse `_pipeline_hint()` to stay aligned. + wizard_hint = _pipeline_hint() + + pipeline_label = "🚀 Run full pipeline (process → postprocess → index)" + wizard_label = "🧙 Build a full pipeline config (guided wizard)" + chat_label = "💬 Chat with indexed documents" pipeline_choice = questionary.Choice( - "🚀 Run full pipeline (process → postprocess → index)" - + (" [dim](extras missing)[/dim]" if pipeline_hint else ""), + _disabled_label(pipeline_label) if pipeline_hint else pipeline_label, value="pipeline", - disabled=pipeline_hint, + disabled="missing extras" if pipeline_hint else None, ) wizard_choice = questionary.Choice( - "🧙 Build a full pipeline config (guided wizard)", + _disabled_label(wizard_label) if wizard_hint else wizard_label, value="wizard", - ) # wizard only writes YAML, no heavy imports needed + disabled="missing extras" if wizard_hint else None, + ) chat_choice = questionary.Choice( - "💬 Chat with indexed documents" - + (" [dim](extras missing)[/dim]" if chat_hint else ""), + _disabled_label(chat_label) if chat_hint else chat_label, value="chat", - disabled=chat_hint, + disabled="missing extras" if chat_hint else None, ) return questionary.select( diff --git a/src/mmore/tui/config_builder.py b/src/mmore/tui/config_builder.py index 8fa6e875..37eaddc4 100644 --- a/src/mmore/tui/config_builder.py +++ 
b/src/mmore/tui/config_builder.py @@ -97,7 +97,7 @@ def _post_validation_menu(path: str, spec: CommandSpec) -> str: questionary.Choice("👁 Preview config", value="preview"), questionary.Choice("✎ Edit in $EDITOR", value="edit"), ], - default="▶ Run with this config", + default="run", style=QSTYLE, qmark=QMARK, ) @@ -503,11 +503,19 @@ def build_process_config_wizard() -> str: def _postprocessor_choices() -> list[str]: - """Enumerate every post-processor `type` string the loader accepts.""" - from mmore.process.post_processor.filter import FILTER_TYPES - from mmore.process.post_processor.tagger import TAGGER_TYPES + """Enumerate every post-processor `type` string the loader accepts. - return ["chunker", "ner", "translator", "metafuse", *TAGGER_TYPES, *FILTER_TYPES] + The wizard is reachable without the `process` extra installed (it only + writes YAML), so we fall back to the core set if the extra modules are + missing instead of crashing mid-wizard with an ImportError. + """ + base = ["chunker", "ner", "translator", "metafuse"] + try: + from mmore.process.post_processor.filter import FILTER_TYPES + from mmore.process.post_processor.tagger import TAGGER_TYPES + except ImportError: + return base + return [*base, *TAGGER_TYPES, *FILTER_TYPES] def _ask_module_args(pp_type: str) -> dict[str, Any]: diff --git a/src/mmore/tui/theme.py b/src/mmore/tui/theme.py index d719c351..b3710278 100644 --- a/src/mmore/tui/theme.py +++ b/src/mmore/tui/theme.py @@ -19,6 +19,7 @@ ("highlighted", "fg:#5fd7ff bold"), ("selected", "fg:#ff5fd7"), ("instruction", "fg:#808080 italic"), + ("disabled", "fg:#ffaf00 italic"), ] ) QMARK = "▸" From 0d78172cb20b19cc349e859b59319b5b0baf70bb Mon Sep 17 00:00:00 2001 From: Arthur PERRIN Date: Wed, 13 May 2026 23:29:22 +0200 Subject: [PATCH 09/24] fix(tui): address Copilot review comments - Use time.time_ns() for config filenames to avoid collisions - Support $EDITOR with flags (e.g. 
"code -w") via shlex.split - Use _validate_with_spinner in pick_or_build_config to show feedback during slow dataclass imports - Expand ~ and env vars on manual path input (expanduser/expandvars) - Replace cwd_default("examples/...") with resolve_example() so defaults resolve correctly from any CWD - Narrow ImportError catch in cli.py to ModuleNotFoundError for expected TUI deps only, re-raise other import errors - Fix paths.py docstring to match actual implementation (no importlib.resources fallback) --- src/mmore/cli.py | 13 +++++++------ src/mmore/tui/config_builder.py | 19 ++++++++++++------- src/mmore/tui/paths.py | 8 +++----- 3 files changed, 22 insertions(+), 18 deletions(-) diff --git a/src/mmore/cli.py b/src/mmore/cli.py index 080b4be9..d6333a69 100644 --- a/src/mmore/cli.py +++ b/src/mmore/cli.py @@ -270,12 +270,13 @@ def tui(): """Launch the interactive Terminal UI.""" try: from .tui import run - except ImportError as e: - click.echo( - f"TUI dependencies missing ({e.name or e}). " - "Install with: uv sync --extra tui" - ) - raise SystemExit(1) + except ModuleNotFoundError as e: + if e.name in ("questionary", "rich", "prompt_toolkit"): + click.echo( + f"TUI dependency missing ({e.name}). 
Install with: uv sync --extra tui" + ) + raise SystemExit(1) + raise run() diff --git a/src/mmore/tui/config_builder.py b/src/mmore/tui/config_builder.py index 37eaddc4..8f38a9b1 100644 --- a/src/mmore/tui/config_builder.py +++ b/src/mmore/tui/config_builder.py @@ -9,6 +9,7 @@ from __future__ import annotations import os +import shlex import subprocess import time from pathlib import Path @@ -58,7 +59,7 @@ def _confirm(question: str, default: bool = False) -> bool: def _save(name: str, data: dict[str, Any]) -> str: CONFIG_DIR.mkdir(parents=True, exist_ok=True) - path = CONFIG_DIR / f"{name}-{int(time.time())}.yaml" + path = CONFIG_DIR / f"{name}-{time.time_ns()}.yaml" with open(path, "w") as f: yaml.safe_dump(data, f, sort_keys=False) return str(path) @@ -78,9 +79,12 @@ def _preview_config(path: str) -> None: def _edit_config(path: str) -> None: - """Open a config file in $EDITOR (falls back to vi).""" + """Open a config file in $EDITOR (falls back to vi). + + Supports editors with flags like ``EDITOR="code -w"`` via shlex.split. 
+ """ editor = os.environ.get("EDITOR", "vi") - subprocess.call([editor, path]) + subprocess.call([*shlex.split(editor), path]) def _post_validation_menu(path: str, spec: CommandSpec) -> str: @@ -297,7 +301,7 @@ def build_rag_config() -> str: } if mode == "local": input_file = _prompt( - "Queries JSONL path", cwd_default("examples/rag/queries.jsonl") + "Queries JSONL path", resolve_example("examples/rag/queries.jsonl") ) output_file = _prompt( "Output JSON path", cwd_default("outputs/rag/output.json") @@ -324,7 +328,7 @@ def build_websearch_config() -> str: if use_rag: rag_path = _prompt( "Path to a RAG config YAML", - cwd_default("examples/rag/config.yaml"), + resolve_example("examples/rag/config.yaml"), ) llm_name = _prompt("LLM name", "OpenMeditron/meditron3-8b") max_new_tokens_raw = _prompt("Max new tokens", "1200") @@ -333,7 +337,7 @@ def build_websearch_config() -> str: except ValueError: max_new_tokens = 1200 input_queries = _prompt( - "Input queries JSONL", cwd_default("examples/rag/queries.jsonl") + "Input queries JSONL", resolve_example("examples/rag/queries.jsonl") ) output_file = _prompt( "Output JSON path", @@ -822,6 +826,7 @@ def pick_or_build_config( if choice == "manual": manual = _prompt("Path to YAML config") + manual = os.path.expandvars(os.path.expanduser(manual)) if not os.path.exists(manual): _show_error_panel(manual, "file not found") continue @@ -841,7 +846,7 @@ def pick_or_build_config( path = builder() assert path is not None - err = _validate_yaml(path, spec) + err = _validate_with_spinner(path, spec) if err is None: return _post_validation_menu(path, spec) _show_error_panel(path, err) diff --git a/src/mmore/tui/paths.py b/src/mmore/tui/paths.py index 17194f00..3c6233bf 100644 --- a/src/mmore/tui/paths.py +++ b/src/mmore/tui/paths.py @@ -1,10 +1,8 @@ -"""Locate bundled example configs regardless of CWD or install layout. +"""Locate bundled example configs regardless of CWD. 
Strategy: -- If `examples/` exists relative to CWD (source checkout), use it. -- Else, walk up from CWD looking for a repo root that contains `examples/`. -- Else, fall back to `importlib.resources` to read examples shipped with the - package (only available if the wheel actually bundles them). +- Walk up from CWD looking for a directory that contains ``examples/`` + (works from any subdirectory of a source checkout). - If nothing is found, return the original repo-relative path so error messages stay readable; callers handle "missing" gracefully. """ From bb808499a7fbd4b535737dd8eedd294a70170b86 Mon Sep 17 00:00:00 2001 From: Mathieu Date: Sun, 17 May 2026 15:00:56 +0200 Subject: [PATCH 10/24] fix(tui): expose merged_results_path helper in run_process + uv.lock --- src/mmore/run_process.py | 14 +++++++++++--- src/mmore/tui/config_builder.py | 4 +++- src/mmore/tui/pipeline.py | 5 ++--- uv.lock | 2 +- 4 files changed, 17 insertions(+), 8 deletions(-) diff --git a/src/mmore/run_process.py b/src/mmore/run_process.py index da53c62a..66484109 100644 --- a/src/mmore/run_process.py +++ b/src/mmore/run_process.py @@ -44,11 +44,19 @@ class ProcessInference: previous_results: Optional[str] = None +def merged_results_path(output_path: str) -> str: + """Path where `process` writes its final merged JSONL. + + Single source of truth for downstream tooling (TUI, scripts) that needs + to locate the JSONL produced by a `process` run from its config. 
+ """ + return os.path.join(output_path, "merged", "merged_results.jsonl") + + def _write_merged_results(output_path, reused_samples, dispatched=True): """Merge per-processor JSONL files and reused samples into a single output.""" - merged_output_path = os.path.join(output_path, "merged") - output_file = os.path.join(merged_output_path, "merged_results.jsonl") - os.makedirs(merged_output_path, exist_ok=True) + output_file = merged_results_path(output_path) + os.makedirs(os.path.dirname(output_file), exist_ok=True) total_results = 0 with open(output_file, "w") as f: diff --git a/src/mmore/tui/config_builder.py b/src/mmore/tui/config_builder.py index 8f38a9b1..bd4ce7ee 100644 --- a/src/mmore/tui/config_builder.py +++ b/src/mmore/tui/config_builder.py @@ -481,8 +481,10 @@ def build_process_config_wizard() -> str: } # Incremental resume: detect previous results + from mmore.run_process import merged_results_path + previous_results = None - prev_path = os.path.join(output_path, "merged", "merged_results.jsonl") + prev_path = merged_results_path(output_path) if os.path.exists(prev_path) and _confirm( f"Previous results found at {prev_path}. Resume (skip unchanged files)?", default=True, diff --git a/src/mmore/tui/pipeline.py b/src/mmore/tui/pipeline.py index d79f83ce..65017ac0 100644 --- a/src/mmore/tui/pipeline.py +++ b/src/mmore/tui/pipeline.py @@ -31,12 +31,11 @@ def _process_output_jsonl(config_path: str) -> str: Goes through `mmore.utils.load_config` so env-var expansion ($ROOT_OUT_DIR, etc.) matches what the underlying command sees. 
""" - from mmore.run_process import ProcessInference + from mmore.run_process import ProcessInference, merged_results_path from mmore.utils import load_config cfg: ProcessInference = load_config(config_path, ProcessInference) - out = cfg.dispatcher_config.output_path - return os.path.join(out, "merged", "merged_results.jsonl") + return merged_results_path(cfg.dispatcher_config.output_path) def _postprocess_output_jsonl(config_path: str) -> str: diff --git a/uv.lock b/uv.lock index 933ebc23..94f0d1da 100644 --- a/uv.lock +++ b/uv.lock @@ -3621,7 +3621,7 @@ wheels = [ [[package]] name = "mmore" -version = "1.2.2" +version = "1.2.3" source = { editable = "." } dependencies = [ { name = "click" }, From 244a972085b94a556429a76e6bd021cdd7013af9 Mon Sep 17 00:00:00 2001 From: Mathieu Date: Mon, 18 May 2026 19:07:21 +0200 Subject: [PATCH 11/24] remove spinner + expose merged_results_path helper --- src/mmore/tui/app.py | 9 ++++----- src/mmore/tui/pipeline.py | 10 +++++----- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/src/mmore/tui/app.py b/src/mmore/tui/app.py index 4a5b33e0..1a758012 100644 --- a/src/mmore/tui/app.py +++ b/src/mmore/tui/app.py @@ -5,9 +5,7 @@ import time import questionary -from rich.live import Live from rich.panel import Panel -from rich.spinner import Spinner from rich.text import Text from mmore.tui.commands import REGISTRY, check_stage_available @@ -79,10 +77,11 @@ def _disabled_label(label: str) -> str: def _run_with_spinner(label: str, fn, **kwargs) -> None: + # See pipeline._run_step: heavy underlying commands log to stdout in ways + # that clash with a rich Live spinner. Plain prints keep output readable. 
start = time.time() - spinner = Spinner("dots", text=Text(f" {label}…", style=ACCENT)) - with Live(spinner, console=console, refresh_per_second=12, transient=True): - fn(**kwargs) + console.print(f" [{ACCENT}]▸[/] {label}…") + fn(**kwargs) console.print(f" [{OK}]✓[/] {label} [dim]({time.time() - start:.1f}s)[/dim]") diff --git a/src/mmore/tui/pipeline.py b/src/mmore/tui/pipeline.py index 65017ac0..2263ea0b 100644 --- a/src/mmore/tui/pipeline.py +++ b/src/mmore/tui/pipeline.py @@ -6,8 +6,6 @@ import time import questionary -from rich.live import Live -from rich.spinner import Spinner from rich.table import Table from rich.text import Text @@ -54,10 +52,12 @@ def _postprocess_output_jsonl(config_path: str) -> str: def _run_step(label: str, fn, **kwargs) -> float: + # No Live spinner here: run_process / run_index emit their own logs via + # `logging` and `click.echo`, which bypass rich.Console and clash with a + # refreshing spinner. Plain prints keep the output readable. start = time.time() - spinner = Spinner("dots", text=Text(f" {label}…", style=ACCENT)) - with Live(spinner, console=console, refresh_per_second=12, transient=True): - fn(**kwargs) + console.print(f" [{ACCENT}]▸[/] {label}…") + fn(**kwargs) elapsed = time.time() - start console.print(f" [{OK}]✓[/] {label} [dim]({elapsed:.1f}s)[/dim]") return elapsed From 072901be86fb6317528001ff30384c36a271a299 Mon Sep 17 00:00:00 2001 From: Mathieu Date: Mon, 18 May 2026 19:26:55 +0200 Subject: [PATCH 12/24] warm pipeline dataclasses + factor run_step helper + update .gitignore --- .gitignore | 1 + src/mmore/tui/app.py | 40 ++++++++++++++++++++++++++++----------- src/mmore/tui/pipeline.py | 29 +++++++--------------------- src/mmore/tui/theme.py | 18 ++++++++++++++++++ 4 files changed, 55 insertions(+), 33 deletions(-) diff --git a/.gitignore b/.gitignore index a490b5de..eaf88e7e 100644 --- a/.gitignore +++ b/.gitignore @@ -114,6 +114,7 @@ venv.bak/ # Milvus DB db/ *.db +*.db.lock # Project files tmp/ diff --git 
a/src/mmore/tui/app.py b/src/mmore/tui/app.py index 1a758012..69d13854 100644 --- a/src/mmore/tui/app.py +++ b/src/mmore/tui/app.py @@ -2,7 +2,7 @@ from __future__ import annotations -import time +import threading import questionary from rich.panel import Panel @@ -24,10 +24,35 @@ QMARK, QSTYLE, console, + run_step, section, show_banner, ) +_PIPELINE_STAGES = ("process", "postprocess", "index") + + +def _warm_pipeline_dataclasses() -> None: + """Pre-load process/postprocess/index dataclasses in a daemon thread. + + Called when entering the wizard or full-pipeline flows, where several YAML + validations happen back-to-back. The import cost then overlaps with the + wizard's own prompts. Daemon = no impact on exit. Stages whose canary + imports are missing are skipped so partial installs don't crash the warm-up. + """ + + def _warm() -> None: + for stage in _PIPELINE_STAGES: + spec = REGISTRY[stage] + if check_stage_available(spec) is not None or spec.config_dataclass is None: + continue + try: + spec.config_dataclass() + except Exception: # noqa: BLE001 + pass + + threading.Thread(target=_warm, daemon=True).start() + def _show_missing_extras(spec_name: str, hint: str) -> None: console.print( @@ -76,15 +101,6 @@ def _disabled_label(label: str) -> str: return f"⚠ {label}" -def _run_with_spinner(label: str, fn, **kwargs) -> None: - # See pipeline._run_step: heavy underlying commands log to stdout in ways - # that clash with a rich Live spinner. Plain prints keep output readable. 
- start = time.time() - console.print(f" [{ACCENT}]▸[/] {label}…") - fn(**kwargs) - console.print(f" [{OK}]✓[/] {label} [dim]({time.time() - start:.1f}s)[/dim]") - - def _run_single_command() -> None: choices = [] enabled_count = 0 @@ -149,7 +165,7 @@ def _run_single_command() -> None: if interactive: spec.run(**kwargs) else: - _run_with_spinner(spec.description, spec.run, **kwargs) + run_step(spec.description, spec.run, **kwargs) console.print(f"[{OK}]✓ {name} finished[/]") @@ -161,6 +177,7 @@ def _chat_only() -> None: def _run_full_wizard() -> None: + _warm_pipeline_dataclasses() paths = build_full_pipeline_wizard() console.print() console.print( @@ -261,6 +278,7 @@ def run() -> None: if mode == "single": _run_single_command() elif mode == "pipeline": + _warm_pipeline_dataclasses() run_full_pipeline() elif mode == "wizard": _run_full_wizard() diff --git a/src/mmore/tui/pipeline.py b/src/mmore/tui/pipeline.py index 2263ea0b..025692fb 100644 --- a/src/mmore/tui/pipeline.py +++ b/src/mmore/tui/pipeline.py @@ -2,9 +2,6 @@ from __future__ import annotations -import os -import time - import questionary from rich.table import Table from rich.text import Text @@ -16,8 +13,8 @@ ACCENT, ACCENT2, MUTED, - OK, console, + run_step, section, step_header, ) @@ -51,18 +48,6 @@ def _postprocess_output_jsonl(config_path: str) -> str: return jsonl_path(cfg.output.output_path) -def _run_step(label: str, fn, **kwargs) -> float: - # No Live spinner here: run_process / run_index emit their own logs via - # `logging` and `click.echo`, which bypass rich.Console and clash with a - # refreshing spinner. Plain prints keep the output readable. 
- start = time.time() - console.print(f" [{ACCENT}]▸[/] {label}…") - fn(**kwargs) - elapsed = time.time() - start - console.print(f" [{OK}]✓[/] {label} [dim]({elapsed:.1f}s)[/dim]") - return elapsed - - def _summary_table(rows: list[tuple[str, str, float]]) -> Table: table = Table( title="[bold]Pipeline summary[/bold]", @@ -97,7 +82,7 @@ def run_pipeline_with_configs(process_cfg: str, pp_cfg: str, index_cfg: str) -> rows: list[tuple[str, str, float]] = [] step_header(1, 3, "process") - elapsed = _run_step( + elapsed = run_step( "Crawling + extracting documents", REGISTRY["process"].run, config_file=process_cfg, @@ -107,7 +92,7 @@ def run_pipeline_with_configs(process_cfg: str, pp_cfg: str, index_cfg: str) -> inspect_jsonl(process_jsonl) step_header(2, 3, "postprocess") - elapsed = _run_step( + elapsed = run_step( "Chunking + cleaning", REGISTRY["postprocess"].run, config_file=pp_cfg, @@ -118,7 +103,7 @@ def run_pipeline_with_configs(process_cfg: str, pp_cfg: str, index_cfg: str) -> inspect_jsonl(pp_jsonl) step_header(3, 3, "index") - elapsed = _run_step( + elapsed = run_step( "Embedding + indexing into Milvus", REGISTRY["index"].run, config_file=index_cfg, @@ -149,7 +134,7 @@ def run_full_pipeline() -> None: step_header(1, 3, "process") process_cfg = pick_or_build_config(REGISTRY["process"]) - elapsed = _run_step( + elapsed = run_step( "Crawling + extracting documents", REGISTRY["process"].run, config_file=process_cfg, @@ -160,7 +145,7 @@ def run_full_pipeline() -> None: step_header(2, 3, "postprocess") pp_cfg = pick_or_build_config(REGISTRY["postprocess"]) - elapsed = _run_step( + elapsed = run_step( "Chunking + cleaning", REGISTRY["postprocess"].run, config_file=pp_cfg, @@ -172,7 +157,7 @@ def run_full_pipeline() -> None: step_header(3, 3, "index") index_cfg = pick_or_build_config(REGISTRY["index"], documents_path=pp_jsonl) - elapsed = _run_step( + elapsed = run_step( "Embedding + indexing into Milvus", REGISTRY["index"].run, config_file=index_cfg, diff --git 
a/src/mmore/tui/theme.py b/src/mmore/tui/theme.py index b3710278..cfae578f 100644 --- a/src/mmore/tui/theme.py +++ b/src/mmore/tui/theme.py @@ -2,6 +2,9 @@ from __future__ import annotations +import time +from typing import Any, Callable + from questionary import Style from rich.align import Align from rich.console import Console, Group @@ -77,6 +80,21 @@ def section(title: str, body: str | Text, style: str = ACCENT) -> Panel: ) +def run_step(label: str, fn: Callable[..., Any], **kwargs: Any) -> float: + """Print a start line, call fn(**kwargs), print a timed done line. + + Heavy pipeline commands emit their own logs via logging/click which bypass + rich.Console — a Live spinner would clash with them. Plain prints keep the + output readable while still showing progress. + """ + start = time.time() + console.print(f" [{ACCENT}]▸[/] {label}…") + fn(**kwargs) + elapsed = time.time() - start + console.print(f" [{OK}]✓[/] {label} [dim]({elapsed:.1f}s)[/dim]") + return elapsed + + def step_header(idx: int, total: int, name: str) -> None: bar = "─" * 4 console.print() From d665ee9720cdd11471cdba2d5c1cbbc63914a965 Mon Sep 17 00:00:00 2001 From: Arthur PERRIN Date: Tue, 19 May 2026 13:35:00 +0200 Subject: [PATCH 13/24] feat(tui): add setup wizard for deps install and .env generation Add a guided setup flow accessible from the main menu that walks the user through picking pipeline stages, selecting a compute backend (cpu / cu126), running `uv sync` with the right extras, and generating a .env file with the API keys / paths each stage needs. Existing .env entries are preserved on merge, and secret values are masked in the preview table. 
--- src/mmore/tui/app.py | 2 + src/mmore/tui/setup.py | 307 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 309 insertions(+) create mode 100644 src/mmore/tui/setup.py diff --git a/src/mmore/tui/app.py b/src/mmore/tui/app.py index 69d13854..d5e54e99 100644 --- a/src/mmore/tui/app.py +++ b/src/mmore/tui/app.py @@ -16,6 +16,7 @@ from mmore.tui.exceptions import UserCancelledError from mmore.tui.paths import cwd_default from mmore.tui.pipeline import run_full_pipeline, run_pipeline_with_configs +from mmore.tui.setup import run_setup_wizard from mmore.tui.theme import ( ACCENT, ACCENT2, @@ -252,6 +253,7 @@ def _main_menu() -> str | None: wizard_choice, chat_choice, questionary.Separator(), + questionary.Choice("🔧 Setup (install deps + generate .env)", value="setup"), questionary.Choice("✕ Quit", value="quit"), ], style=QSTYLE, diff --git a/src/mmore/tui/setup.py b/src/mmore/tui/setup.py new file mode 100644 index 00000000..45ef2931 --- /dev/null +++ b/src/mmore/tui/setup.py @@ -0,0 +1,307 @@ +"""Setup wizard: install extras + generate .env in one guided flow.""" + +from __future__ import annotations + +import os +import subprocess +import sys +from pathlib import Path +from typing import Any + +import questionary +from rich.panel import Panel +from rich.syntax import Syntax +from rich.table import Table +from rich.text import Text + +from mmore.tui.commands import REGISTRY, check_stage_available +from mmore.tui.config_builder import _ask, _confirm, _prompt +from mmore.tui.theme import ACCENT, ACCENT2, MUTED, OK, QMARK, QSTYLE, console + +# --------------------------------------------------------------------------- +# Stage → extras mapping +# --------------------------------------------------------------------------- + +_STAGE_EXTRAS: dict[str, list[str]] = { + "process": ["process"], + "postprocess": ["process"], + "index": ["index"], + "rag": ["rag"], + "ragcli": ["rag"], + "retrieve": ["rag", "api"], + "websearch": ["websearch"], +} + +_COMPUTE_EXTRAS = [ 
+ ("cpu", "CPU-only (no CUDA)"), + ("cu126", "CUDA 12.6 (GPU)"), +] + +# --------------------------------------------------------------------------- +# Stage → env vars that may be needed +# --------------------------------------------------------------------------- + +_STAGE_ENV_VARS: dict[str, list[tuple[str, str, str]]] = { + # (var_name, description, default_or_empty) + "process": [ + ("ROOT_OUT_DIR", "Root output directory for processed results", ""), + ("ROOT_IN_DIR", "Root input directory for source documents", ""), + ], + "rag": [ + ("OPENAI_API_KEY", "OpenAI API key (for GPT models)", ""), + ("ANTHROPIC_API_KEY", "Anthropic API key (for Claude models)", ""), + ("MISTRAL_API_KEY", "Mistral API key", ""), + ("COHERE_API_KEY", "Cohere API key", ""), + ("HF_TOKEN", "HuggingFace token (for gated models)", ""), + ], + "websearch": [ + ("TAVILY_API_KEY", "Tavily API key (optional, DuckDuckGo used otherwise)", ""), + ], +} + +# Aliases: ragcli and retrieve share rag's env vars +_STAGE_ENV_VARS["ragcli"] = _STAGE_ENV_VARS["rag"] +_STAGE_ENV_VARS["retrieve"] = _STAGE_ENV_VARS["rag"] + +# Profiling env vars (always available) +_PROFILING_VARS: list[tuple[str, str, str]] = [ + ("MMORE_PROFILING_ENABLED", "Enable profiling", "false"), + ("MMORE_PROFILING_OUTPUT_DIR", "Profiling output directory", "./profiling_output"), +] + + +def _detect_installed_stages() -> dict[str, bool]: + """Check which stages have their deps installed.""" + return {name: check_stage_available(spec) is None for name, spec in REGISTRY.items()} + + +def _pick_stages() -> list[str]: + """Ask the user which pipeline stages they want to use.""" + installed = _detect_installed_stages() + choices = [] + for name, spec in REGISTRY.items(): + label = f"{name:<12} — {spec.description}" + if installed[name]: + label += " [dim](installed)[/dim]" + choices.append(questionary.Choice(label, value=name, checked=not installed[name])) + + selected = _ask( + questionary.checkbox( + "Which stages do you want to set 
up?", + choices=choices, + style=QSTYLE, + qmark=QMARK, + ) + ) + return selected + + +def _pick_compute() -> str: + """Ask the user which compute backend to use.""" + choices = [ + questionary.Choice(f"{name:<6} — {desc}", value=name) + for name, desc in _COMPUTE_EXTRAS + ] + return _ask( + questionary.select( + "Compute backend", + choices=choices, + style=QSTYLE, + qmark=QMARK, + ) + ) + + +def _build_uv_command(stages: list[str], compute: str) -> list[str]: + """Build the uv sync command from selected stages + compute.""" + extras: set[str] = {"tui"} # always include TUI + for stage in stages: + extras.update(_STAGE_EXTRAS.get(stage, [])) + extras.add(compute) + + cmd = [sys.executable, "-m", "uv", "sync"] + for extra in sorted(extras): + cmd.extend(["--extra", extra]) + return cmd + + +def _install_deps(stages: list[str], compute: str) -> bool: + """Run uv sync with the right extras. Returns True on success.""" + cmd = _build_uv_command(stages, compute) + display_cmd = " ".join(cmd[2:]) # skip python -m prefix for display + console.print(f"\n [bold]Running:[/] {display_cmd}\n") + + result = subprocess.run(cmd, cwd=os.getcwd()) + if result.returncode == 0: + console.print(f" [{OK}]✓[/] Dependencies installed successfully") + return True + console.print(" [bold red]✗[/] Installation failed — check output above") + return False + + +def _collect_env_vars(stages: list[str]) -> dict[str, str]: + """Prompt the user for env vars needed by their selected stages.""" + seen: set[str] = set() + env_vars: dict[str, str] = {} + + # Gather all relevant vars (deduplicated) + all_vars: list[tuple[str, str, str]] = [] + for stage in stages: + for var in _STAGE_ENV_VARS.get(stage, []): + if var[0] not in seen: + seen.add(var[0]) + all_vars.append(var) + + if not all_vars: + return env_vars + + console.print( + Panel( + "Set environment variables for your selected stages.\n" + "Leave blank to skip — you can always edit the .env file later.", + title="[bold]Environment 
variables[/bold]", + border_style=ACCENT, + padding=(1, 2), + ) + ) + + for var_name, description, default in all_vars: + # Check if already set in environment + current = os.environ.get(var_name, "") + hint = f" [dim](current: {current[:20]}…)[/dim]" if current else "" + value = _prompt(f"{var_name} — {description}{hint}", default=current or default) + if value: + env_vars[var_name] = value + + # Optionally add profiling vars + if _confirm("Configure profiling settings?", default=False): + for var_name, description, default in _PROFILING_VARS: + value = _prompt(f"{var_name} — {description}", default=default) + if value: + env_vars[var_name] = value + + return env_vars + + +def _write_dotenv(env_vars: dict[str, str], path: str = ".env") -> str: + """Write or merge env vars into a .env file. + + Existing variables in the file are preserved; new ones are appended. + """ + existing: dict[str, str] = {} + lines: list[str] = [] + env_path = Path(path) + + if env_path.exists(): + raw = env_path.read_text() + for line in raw.splitlines(): + stripped = line.strip() + if stripped and not stripped.startswith("#") and "=" in stripped: + key = stripped.split("=", 1)[0].strip() + existing[key] = line + lines.append(line) + + # Append new vars + added = [] + for key, value in env_vars.items(): + if key in existing: + continue # don't overwrite existing + # Quote values that contain spaces + if " " in value: + entry = f'{key}="{value}"' + else: + entry = f"{key}={value}" + lines.append(entry) + added.append(key) + + if lines and not lines[-1].endswith("\n"): + content = "\n".join(lines) + "\n" + else: + content = "\n".join(lines) + + env_path.write_text(content) + return str(env_path) + + +def _preview_dotenv(env_vars: dict[str, str]) -> None: + """Show what will be written to .env.""" + if not env_vars: + console.print(" [dim]No environment variables to write.[/dim]") + return + + table = Table( + title="[bold].env preview[/bold]", + title_style=ACCENT2, + border_style=ACCENT, + 
show_lines=False, + ) + table.add_column("Variable", style="bold") + table.add_column("Value", style=MUTED) + + for key, value in env_vars.items(): + # Mask API keys + if "KEY" in key or "TOKEN" in key: + display = value[:4] + "…" + value[-4:] if len(value) > 8 else "****" + else: + display = value + table.add_row(key, display) + + console.print(table) + + +def run_setup_wizard() -> None: + """Full setup wizard: pick stages → install deps → generate .env.""" + console.print( + Panel( + Text( + "This wizard will:\n" + " 1. Install the right Python dependencies for your pipeline\n" + " 2. Generate a .env file with the required environment variables", + ), + title="[bold]Setup wizard[/bold]", + border_style=ACCENT2, + padding=(1, 2), + ) + ) + + # Step 1: pick stages + stages = _pick_stages() + if not stages: + console.print(" [dim]No stages selected — nothing to do.[/dim]") + return + + # Step 2: pick compute backend + compute = _pick_compute() + + # Step 3: show install command and confirm + cmd = _build_uv_command(stages, compute) + display_cmd = " ".join(cmd[2:]) + console.print( + Panel( + Text(display_cmd), + title="[bold]Install command[/bold]", + border_style=ACCENT, + padding=(0, 2), + ) + ) + if _confirm("Install dependencies now?", default=True): + if not _install_deps(stages, compute): + if not _confirm("Continue to .env setup despite install failure?", default=False): + return + + # Step 4: collect env vars + env_vars = _collect_env_vars(stages) + + # Step 5: preview and write .env + if env_vars: + _preview_dotenv(env_vars) + env_path = _prompt(".env file path", default=".env") + if _confirm(f"Write {len(env_vars)} variable(s) to {env_path}?", default=True): + written = _write_dotenv(env_vars, env_path) + console.print(f" [{OK}]✓[/] Saved to {written}") + else: + console.print(" [dim]Skipped .env generation.[/dim]") + else: + console.print(" [dim]No environment variables needed for selected stages.[/dim]") + + console.print(f"\n [{OK}]✓ Setup complete![/] 
Run [bold]mmore tui[/bold] to start.\n") From 47b8b12198200226e31422b185a7a1321a6e4002 Mon Sep 17 00:00:00 2001 From: Arthur PERRIN Date: Tue, 19 May 2026 14:40:57 +0200 Subject: [PATCH 14/24] style(tui): match GitHub logo colors in banner MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Outline characters in white, filled blocks in pure black, second M in yellow — matches the m(m)ore logo. Uses hex colors to avoid terminal themes remapping ANSI black to dark grey. --- src/mmore/tui/theme.py | 40 ++++++++++++++++++++++++++++++++-------- 1 file changed, 32 insertions(+), 8 deletions(-) diff --git a/src/mmore/tui/theme.py b/src/mmore/tui/theme.py index cfae578f..4f29b727 100644 --- a/src/mmore/tui/theme.py +++ b/src/mmore/tui/theme.py @@ -46,20 +46,44 @@ """ -def _gradient(text: str, start: str = "bright_cyan", end: str = "magenta") -> Text: - """Cheap two-color gradient — top half ACCENT, bottom half ACCENT2.""" - lines = text.splitlines() - half = max(1, len(lines) // 2) +def _mmore_logo(text: str) -> Text: + """Color the banner like the mmore GitHub logo. + + Strategy, per character: + - The second `M` (columns 12:23 of every row) is rendered fully in yellow. + - Elsewhere: outline characters (`╔╗╚╝═║╔╝╗`, etc.) are white and the + filled `█` blocks are black, giving the letters a hollow look. 
+ """ + OUTLINE = set("╔╗╚╝═║╠╣╦╩╬╔╝╗┌┐└┘─│") out = Text() - for i, line in enumerate(lines): - style = start if i < half else end - out.append(line + "\n", style=style) + for line in text.splitlines(): + if not line.strip(): + out.append(line + "\n") + continue + left = line[:12] + mid = line[12:23] + right = line[23:] + + def _emit(segment: str) -> None: + for ch in segment: + if ch == "█": + # explicit hex — terminal "black" often renders as dark grey + out.append(ch, style="#000000") + elif ch in OUTLINE: + out.append(ch, style="bold #ffffff") + else: + out.append(ch) + + _emit(left) + out.append(mid, style="bold yellow") + _emit(right) + out.append("\n") return out def show_banner(subtitle: str = "interactive launcher") -> None: body = Group( - _gradient(BANNER), + _mmore_logo(BANNER), Align.center(Text(subtitle, style=f"italic {MUTED}")), ) console.print( From ca5479d42d158d66b0c6dffc7ec901df9c667ea0 Mon Sep 17 00:00:00 2001 From: Arthur PERRIN Date: Tue, 19 May 2026 14:46:22 +0200 Subject: [PATCH 15/24] fix(tui): ruff lint and format --- src/mmore/tui/app.py | 5 +++-- src/mmore/tui/setup.py | 22 +++++++++++++++------- src/mmore/tui/theme.py | 4 ++-- 3 files changed, 20 insertions(+), 11 deletions(-) diff --git a/src/mmore/tui/app.py b/src/mmore/tui/app.py index d5e54e99..4f0f286c 100644 --- a/src/mmore/tui/app.py +++ b/src/mmore/tui/app.py @@ -16,7 +16,6 @@ from mmore.tui.exceptions import UserCancelledError from mmore.tui.paths import cwd_default from mmore.tui.pipeline import run_full_pipeline, run_pipeline_with_configs -from mmore.tui.setup import run_setup_wizard from mmore.tui.theme import ( ACCENT, ACCENT2, @@ -253,7 +252,9 @@ def _main_menu() -> str | None: wizard_choice, chat_choice, questionary.Separator(), - questionary.Choice("🔧 Setup (install deps + generate .env)", value="setup"), + questionary.Choice( + "🔧 Setup (install deps + generate .env)", value="setup" + ), questionary.Choice("✕ Quit", value="quit"), ], style=QSTYLE, diff --git 
a/src/mmore/tui/setup.py b/src/mmore/tui/setup.py index 45ef2931..cffeb224 100644 --- a/src/mmore/tui/setup.py +++ b/src/mmore/tui/setup.py @@ -6,11 +6,9 @@ import subprocess import sys from pathlib import Path -from typing import Any import questionary from rich.panel import Panel -from rich.syntax import Syntax from rich.table import Table from rich.text import Text @@ -72,7 +70,9 @@ def _detect_installed_stages() -> dict[str, bool]: """Check which stages have their deps installed.""" - return {name: check_stage_available(spec) is None for name, spec in REGISTRY.items()} + return { + name: check_stage_available(spec) is None for name, spec in REGISTRY.items() + } def _pick_stages() -> list[str]: @@ -83,7 +83,9 @@ def _pick_stages() -> list[str]: label = f"{name:<12} — {spec.description}" if installed[name]: label += " [dim](installed)[/dim]" - choices.append(questionary.Choice(label, value=name, checked=not installed[name])) + choices.append( + questionary.Choice(label, value=name, checked=not installed[name]) + ) selected = _ask( questionary.checkbox( @@ -286,7 +288,9 @@ def run_setup_wizard() -> None: ) if _confirm("Install dependencies now?", default=True): if not _install_deps(stages, compute): - if not _confirm("Continue to .env setup despite install failure?", default=False): + if not _confirm( + "Continue to .env setup despite install failure?", default=False + ): return # Step 4: collect env vars @@ -302,6 +306,10 @@ def run_setup_wizard() -> None: else: console.print(" [dim]Skipped .env generation.[/dim]") else: - console.print(" [dim]No environment variables needed for selected stages.[/dim]") + console.print( + " [dim]No environment variables needed for selected stages.[/dim]" + ) - console.print(f"\n [{OK}]✓ Setup complete![/] Run [bold]mmore tui[/bold] to start.\n") + console.print( + f"\n [{OK}]✓ Setup complete![/] Run [bold]mmore tui[/bold] to start.\n" + ) diff --git a/src/mmore/tui/theme.py b/src/mmore/tui/theme.py index 4f29b727..2df57d15 100644 
--- a/src/mmore/tui/theme.py +++ b/src/mmore/tui/theme.py @@ -54,7 +54,7 @@ def _mmore_logo(text: str) -> Text: - Elsewhere: outline characters (`╔╗╚╝═║╔╝╗`, etc.) are white and the filled `█` blocks are black, giving the letters a hollow look. """ - OUTLINE = set("╔╗╚╝═║╠╣╦╩╬╔╝╗┌┐└┘─│") + outline_chars = set("╔╗╚╝═║╠╣╦╩╬╔╝╗┌┐└┘─│") out = Text() for line in text.splitlines(): if not line.strip(): @@ -69,7 +69,7 @@ def _emit(segment: str) -> None: if ch == "█": # explicit hex — terminal "black" often renders as dark grey out.append(ch, style="#000000") - elif ch in OUTLINE: + elif ch in outline_chars: out.append(ch, style="bold #ffffff") else: out.append(ch) From a7fe73fadd4d2012d3486806c73d5e463677bfd5 Mon Sep 17 00:00:00 2001 From: Mathieu Date: Fri, 22 May 2026 14:25:53 +0200 Subject: [PATCH 16/24] center banner, centralize _ask, add int/float prompts --- src/mmore/tui/config_builder.py | 208 +++++++++++++++----------------- src/mmore/tui/paths.py | 2 - src/mmore/tui/theme.py | 2 +- 3 files changed, 95 insertions(+), 117 deletions(-) diff --git a/src/mmore/tui/config_builder.py b/src/mmore/tui/config_builder.py index bd4ce7ee..6c54c6a0 100644 --- a/src/mmore/tui/config_builder.py +++ b/src/mmore/tui/config_builder.py @@ -57,6 +57,20 @@ def _confirm(question: str, default: bool = False) -> bool: ) +def _prompt_int(question: str, default: int) -> int: + try: + return int(_prompt(question, str(default))) + except ValueError: + return default + + +def _prompt_float(question: str, default: float) -> float: + try: + return float(_prompt(question, str(default))) + except ValueError: + return default + + def _save(name: str, data: dict[str, Any]) -> str: CONFIG_DIR.mkdir(parents=True, exist_ok=True) path = CONFIG_DIR / f"{name}-{time.time_ns()}.yaml" @@ -181,24 +195,24 @@ def build_process_config() -> str: def build_postprocess_config() -> str: - strategy = questionary.select( - "Chunking strategy", - choices=["sentence", "token", "word", "semantic"], - 
default="sentence", - style=QSTYLE, - qmark=QMARK, - ).ask() - if strategy is None: - raise UserCancelledError("cancelled") - table_handling = questionary.select( - "Table handling", - choices=["single_row", "multi_rows", "keep_whole", "none"], - default="single_row", - style=QSTYLE, - qmark=QMARK, - ).ask() - if table_handling is None: - raise UserCancelledError("cancelled") + strategy = _ask( + questionary.select( + "Chunking strategy", + choices=["sentence", "token", "word", "semantic"], + default="sentence", + style=QSTYLE, + qmark=QMARK, + ) + ) + table_handling = _ask( + questionary.select( + "Table handling", + choices=["single_row", "multi_rows", "keep_whole", "none"], + default="single_row", + style=QSTYLE, + qmark=QMARK, + ) + ) output_path = _prompt( "Output JSONL path", cwd_default("outputs/postprocess/results.jsonl"), @@ -247,39 +261,27 @@ def build_index_config(documents_path: Optional[str] = None) -> str: def build_rag_config() -> str: """Wizard for `rag` / `retrieve` / `ragcli` configs.""" llm_name = _prompt("LLM name", "OpenMeditron/meditron3-8b") - max_new_tokens_raw = _prompt("Max new tokens", "1200") - try: - max_new_tokens = int(max_new_tokens_raw) - except ValueError: - max_new_tokens = 1200 + max_new_tokens = _prompt_int("Max new tokens", 1200) db_uri = _prompt( "DB URI (Milvus Lite file or server URL)", cwd_default("proc_demo.db") ) db_name = _prompt("DB name", "my_db") collection = _prompt("Collection name", "my_docs") - k_raw = _prompt("Number of docs to retrieve (k)", "5") - try: - k = int(k_raw) - except ValueError: - k = 5 - hybrid_raw = _prompt("Hybrid search weight (0.0 dense — 1.0 sparse)", "0.5") - try: - hybrid = float(hybrid_raw) - except ValueError: - hybrid = 0.5 + k = _prompt_int("Number of docs to retrieve (k)", 5) + hybrid = _prompt_float("Hybrid search weight (0.0 dense — 1.0 sparse)", 0.5) use_web = _confirm("Augment retrieval with web search?", default=False) reranker = _prompt("Reranker model (blank to skip)", 
"BAAI/bge-reranker-base") - mode = questionary.select( - "Run mode", - choices=["local", "api"], - default="local", - style=QSTYLE, - qmark=QMARK, - ).ask() - if mode is None: - raise UserCancelledError("cancelled") + mode = _ask( + questionary.select( + "Run mode", + choices=["local", "api"], + default="local", + style=QSTYLE, + qmark=QMARK, + ) + ) cfg: dict[str, Any] = { "rag": { @@ -308,11 +310,7 @@ def build_rag_config() -> str: ) cfg["mode_args"] = {"input_file": input_file, "output_file": output_file} else: - port_raw = _prompt("API port", "8000") - try: - port = int(port_raw) - except ValueError: - port = 8000 + port = _prompt_int("API port", 8000) cfg["mode_args"] = { "endpoint": "/rag", "host": "0.0.0.0", @@ -331,11 +329,7 @@ def build_websearch_config() -> str: resolve_example("examples/rag/config.yaml"), ) llm_name = _prompt("LLM name", "OpenMeditron/meditron3-8b") - max_new_tokens_raw = _prompt("Max new tokens", "1200") - try: - max_new_tokens = int(max_new_tokens_raw) - except ValueError: - max_new_tokens = 1200 + max_new_tokens = _prompt_int("Max new tokens", 1200) input_queries = _prompt( "Input queries JSONL", resolve_example("examples/rag/queries.jsonl") ) @@ -343,25 +337,17 @@ def build_websearch_config() -> str: "Output JSON path", cwd_default("outputs/websearch/enhanced_results.json"), ) - n_subqueries_raw = _prompt("Number of sub-queries per question", "2") - try: - n_subqueries = int(n_subqueries_raw) - except ValueError: - n_subqueries = 2 - max_searches_raw = _prompt("Max searches per query", "5") - try: - max_searches = int(max_searches_raw) - except ValueError: - max_searches = 5 - provider = questionary.select( - "Search provider", - choices=["duckduckgo"], - default="duckduckgo", - style=QSTYLE, - qmark=QMARK, - ).ask() - if provider is None: - raise UserCancelledError("cancelled") + n_subqueries = _prompt_int("Number of sub-queries per question", 2) + max_searches = _prompt_int("Max searches per query", 5) + provider = _ask( + 
questionary.select( + "Search provider", + choices=["duckduckgo"], + default="duckduckgo", + style=QSTYLE, + qmark=QMARK, + ) + ) cfg: dict[str, Any] = { "websearch": { @@ -450,14 +436,14 @@ def build_process_config_wizard() -> str: extract_images = _confirm("Extract images from documents?", default=True) names = [n for n, _ in _ALL_PROCESSORS] - selected = questionary.checkbox( - "Select processors to enable", - choices=[questionary.Choice(n, value=n, checked=True) for n in names], - style=QSTYLE, - qmark=QMARK, - ).ask() - if selected is None: - raise UserCancelledError("cancelled") + selected = _ask( + questionary.checkbox( + "Select processors to enable", + choices=[questionary.Choice(n, value=n, checked=True) for n in names], + style=QSTYLE, + qmark=QMARK, + ) + ) if not selected: selected = names # empty would mean a no-op pipeline; fall back to all @@ -466,14 +452,7 @@ def build_process_config_wizard() -> str: for name, default in _ALL_PROCESSORS: if name not in selected: continue - if customize: - raw = _prompt(f"Batch size for {name}", str(default)) - try: - value = int(raw) - except ValueError: - value = default - else: - value = default + value = _prompt_int(f"Batch size for {name}", default) if customize else default sizes.append({name: value}) processor_config = { @@ -526,24 +505,24 @@ def _postprocessor_choices() -> list[str]: def _ask_module_args(pp_type: str) -> dict[str, Any]: if pp_type == "chunker": - strategy = questionary.select( - "Chunking strategy", - choices=["sentence", "token", "word", "semantic"], - default="sentence", - style=QSTYLE, - qmark=QMARK, - ).ask() - if strategy is None: - raise UserCancelledError("cancelled") - table_handling = questionary.select( - "Table handling", - choices=["single_row", "multi_rows", "keep_whole", "none"], - default="single_row", - style=QSTYLE, - qmark=QMARK, - ).ask() - if table_handling is None: - raise UserCancelledError("cancelled") + strategy = _ask( + questionary.select( + "Chunking strategy", + 
choices=["sentence", "token", "word", "semantic"], + default="sentence", + style=QSTYLE, + qmark=QMARK, + ) + ) + table_handling = _ask( + questionary.select( + "Table handling", + choices=["single_row", "multi_rows", "keep_whole", "none"], + default="single_row", + style=QSTYLE, + qmark=QMARK, + ) + ) return { "chunking_strategy": strategy, "table_handling": table_handling, @@ -570,14 +549,14 @@ def build_postprocess_config_wizard() -> str: console.print( f" [dim]current modules:[/] {', '.join(m['type'] for m in modules)}" ) - pp_type = questionary.select( - "Add a post-processor module" if not modules else "Add another module", - choices=[*available, questionary.Separator(), "(done)"], - style=QSTYLE, - qmark=QMARK, - ).ask() - if pp_type is None: - raise UserCancelledError("cancelled") + pp_type = _ask( + questionary.select( + "Add a post-processor module" if not modules else "Add another module", + choices=[*available, questionary.Separator(), "(done)"], + style=QSTYLE, + qmark=QMARK, + ) + ) if pp_type == "(done)": break args = _ask_module_args(pp_type) @@ -847,7 +826,8 @@ def pick_or_build_config( else: path = builder() - assert path is not None + if path is None: + raise UserCancelledError("no config selected") err = _validate_with_spinner(path, spec) if err is None: return _post_validation_menu(path, spec) diff --git a/src/mmore/tui/paths.py b/src/mmore/tui/paths.py index 3c6233bf..cb2594b6 100644 --- a/src/mmore/tui/paths.py +++ b/src/mmore/tui/paths.py @@ -10,12 +10,10 @@ from __future__ import annotations import os -from functools import lru_cache from pathlib import Path from typing import Optional -@lru_cache(maxsize=1) def repo_root() -> Optional[Path]: """Return a directory that contains an `examples/` folder, if any.""" cwd = Path.cwd() diff --git a/src/mmore/tui/theme.py b/src/mmore/tui/theme.py index 2df57d15..4a7aeb6d 100644 --- a/src/mmore/tui/theme.py +++ b/src/mmore/tui/theme.py @@ -83,7 +83,7 @@ def _emit(segment: str) -> None: def 
show_banner(subtitle: str = "interactive launcher") -> None: body = Group( - _mmore_logo(BANNER), + Align.center(_mmore_logo(BANNER)), Align.center(Text(subtitle, style=f"italic {MUTED}")), ) console.print( From c7f67c453ba4320c991e310053c34ed9db722181 Mon Sep 17 00:00:00 2001 From: fabnemEPFL <117652591+fabnemEPFL@users.noreply.github.com> Date: Tue, 12 May 2026 11:22:16 +0200 Subject: [PATCH 17/24] Update paper link from OpenReview to arXiv --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 02a1b4c4..85d9f84b 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ MMORE is an open-source, end-to-end pipeline to ingest, process, index, and retrieve knowledge from heterogeneous files: PDFs, Office docs, spreadsheets, emails, images, audio, video, and web pages. It standardizes content into a unified multimodal format, supports distributed CPU/GPU processing, and provides hybrid dense+sparse retrieval with an integrated RAG service (CLI, APIs). 
-👉 Read the paper for more details (OpenReview): [MMORE: Massive Multimodal Open RAG & Extraction](https://openreview.net/forum?id=6j1HjfIdKn) +👉 Read the paper for more details (arXiv): [MMORE: Massive Multimodal Open RAG & Extraction](https://arxiv.org/abs/2509.11937) ### Documentation From d3fd9650292e4f87aaed45908077c094b9b8eb56 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9my=20Chaverot?= Date: Wed, 13 May 2026 17:02:27 +0200 Subject: [PATCH 18/24] Fix tests not passing in CI (#304) --- pyproject.toml | 3 ++- uv.lock | 19 +++++++++---------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 843fff9b..414ef7d8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -87,7 +87,8 @@ process = [ index = [ # Indexing + Retrieval (vector DB, embeddings) - "pymilvus[milvus-lite]==2.6.6", + "pymilvus==2.6.6", + "milvus-lite==2.5.1", "pymilvus-model>=0.3.2", "milvus-model>=0.2.12", "langchain-milvus>=0.1.8", diff --git a/uv.lock b/uv.lock index 94f0d1da..8f887fea 100644 --- a/uv.lock +++ b/uv.lock @@ -3575,7 +3575,7 @@ name = "milvus-lite" version = "2.5.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "tqdm", marker = "python_full_version < '3.11' or sys_platform != 'win32' or (extra == 'extra-5-mmore-cpu' and extra == 'extra-5-mmore-cu126')" }, + { name = "tqdm" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/a9/b2/acc5024c8e8b6a0b034670b8e8af306ebd633ede777dcbf557eac4785937/milvus_lite-2.5.1-py3-none-macosx_10_9_x86_64.whl", hash = "sha256:6b014453200ba977be37ba660cb2d021030375fa6a35bc53c2e1d92980a0c512", size = 27934713, upload-time = "2025-06-30T04:23:37.028Z" }, @@ -3676,13 +3676,14 @@ all = [ { name = "markdown" }, { name = "markdownify" }, { name = "marker-pdf" }, + { name = "milvus-lite" }, { name = "milvus-model" }, { name = "motor" }, { name = "moviepy" }, { name = "nltk" }, { name = "openpyxl" }, { name = "py7zr" }, - { name = "pymilvus", extra = 
["milvus-lite"] }, + { name = "pymilvus" }, { name = "pymilvus-model" }, { name = "pymongo" }, { name = "pymupdf" }, @@ -3734,8 +3735,9 @@ dev = [ ] index = [ { name = "langchain-milvus" }, + { name = "milvus-lite" }, { name = "milvus-model" }, - { name = "pymilvus", extra = ["milvus-lite"] }, + { name = "pymilvus" }, { name = "pymilvus-model" }, { name = "scipy", version = "1.15.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11' or (extra == 'extra-5-mmore-cpu' and extra == 'extra-5-mmore-cu126')" }, { name = "scipy", version = "1.17.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' or (extra == 'extra-5-mmore-cpu' and extra == 'extra-5-mmore-cu126')" }, @@ -3790,9 +3792,10 @@ rag = [ { name = "langchain-milvus" }, { name = "langchain-mistralai" }, { name = "langchain-openai" }, + { name = "milvus-lite" }, { name = "milvus-model" }, { name = "nltk" }, - { name = "pymilvus", extra = ["milvus-lite"] }, + { name = "pymilvus" }, { name = "pymilvus-model" }, { name = "ragas" }, { name = "scipy", version = "1.15.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11' or (extra == 'extra-5-mmore-cpu' and extra == 'extra-5-mmore-cu126')" }, @@ -3848,6 +3851,7 @@ requires-dist = [ { name = "markdown", marker = "extra == 'process'", specifier = ">=3.5" }, { name = "markdownify", marker = "extra == 'process'", specifier = ">=0.12" }, { name = "marker-pdf", marker = "extra == 'process'", specifier = ">=1.6" }, + { name = "milvus-lite", marker = "extra == 'index'", specifier = "==2.5.1" }, { name = "milvus-model", marker = "extra == 'index'", specifier = ">=0.2.12" }, { name = "mmore", extras = ["index"], marker = "extra == 'rag'" }, { name = "mmore", extras = ["process", "rag", "api", "websearch", "tui"], marker = "extra == 'all'" }, @@ -3861,7 +3865,7 @@ requires-dist = [ { name = "pillow" }, { name = "py7zr", marker = "extra == 'process'", specifier 
= ">=0.22" }, { name = "pydantic", specifier = ">=2.6" }, - { name = "pymilvus", extras = ["milvus-lite"], marker = "extra == 'index'", specifier = "==2.6.6" }, + { name = "pymilvus", marker = "extra == 'index'", specifier = "==2.6.6" }, { name = "pymilvus-model", marker = "extra == 'index'", specifier = ">=0.3.2" }, { name = "pymongo", marker = "extra == 'api'", specifier = ">=4.6" }, { name = "pymupdf", marker = "extra == 'process'" }, @@ -6478,11 +6482,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e5/ab/890c3e258c09981a4df875fe762166b92111fc1f9fb1e646025ebe3acb1b/pymilvus-2.6.6-py3-none-any.whl", hash = "sha256:0e61daa573b0025650f072493cb978a9ada9cdb1d450594707592174b1f297c0", size = 285098, upload-time = "2025-12-30T09:11:27.099Z" }, ] -[package.optional-dependencies] -milvus-lite = [ - { name = "milvus-lite", marker = "sys_platform != 'win32' or (extra == 'extra-5-mmore-cpu' and extra == 'extra-5-mmore-cu126')" }, -] - [[package]] name = "pymilvus-model" version = "0.3.2" From c5415a2a6e8a29af81641fd78e4661e790a0bbee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9my=20Chaverot?= Date: Tue, 19 May 2026 14:43:06 +0200 Subject: [PATCH 19/24] Fix consumed file ID when upload fails (#299) --- src/mmore/run_index_api.py | 53 +++++++++++++++++++++------- tests/test_live_retriever_api.py | 60 ++++++++++++++++++++++++++++++++ 2 files changed, 100 insertions(+), 13 deletions(-) diff --git a/src/mmore/run_index_api.py b/src/mmore/run_index_api.py index 732cc16a..972e2d8a 100644 --- a/src/mmore/run_index_api.py +++ b/src/mmore/run_index_api.py @@ -87,16 +87,28 @@ async def upload_file( await file.close() + # Process and index the file + file_extension = FilePath(file.filename).suffix.lower() + try: + documents = process_files_default( + temp_dir, COLLECTION_NAME, [file_extension] + ) + except KeyError as e: + logger.warning( + "Could not process file '%s' with extension '%s'", + file.filename, + file_extension, + exc_info=True, + ) + raise 
HTTPException( + status_code=422, + detail=f"Could not process file '{file.filename}'", + ) from e + # Save a permanent copy for later retrieval os.makedirs(os.path.dirname(file_storage_path), exist_ok=True) shutil.copy2(temp_file_path, file_storage_path) - # Process and index the file - file_extension = FilePath(file.filename).suffix.lower() - documents = process_files_default( - temp_dir, COLLECTION_NAME, [file_extension] - ) - for doc in documents: defDocId = doc.document_id doc.document_id = fileId @@ -147,6 +159,7 @@ async def upload_files( with tempfile.TemporaryDirectory() as temp_dir: logging.info(f"Starting to process {len(files)} files with custom IDs") + temp_paths: List[FilePath] = [] for file, file_id in zip(files, listIds): if file.filename is None: raise HTTPException( @@ -163,12 +176,10 @@ async def upload_files( ) # Save to temp directory - file_name = FilePath(temp_dir) / file.filename + file_name = FilePath(temp_dir) / f"{file_id}_{file.filename}" with file_name.open("wb") as buffer: shutil.copyfileobj(file.file, buffer) - - # Save a permanent copy - shutil.copy2(file_name, file_storage_path) + temp_paths.append(file_name) # Close the file await file.close() @@ -179,9 +190,25 @@ async def upload_files( file_extensions = [ FilePath(cast(str, file.filename)).suffix.lower() for file in files ] - documents = process_files_default( - temp_dir, COLLECTION_NAME, file_extensions - ) + try: + documents = process_files_default( + temp_dir, COLLECTION_NAME, file_extensions + ) + except KeyError as e: + logger.warning( + "Could not process one of the uploaded files with extensions %s", + file_extensions, + exc_info=True, + ) + raise HTTPException( + status_code=422, + detail="Could not process one of the uploaded files", + ) from e + + # Save permanent copies + for temp_path, file_id in zip(temp_paths, listIds): + file_storage_path = FilePath(UPLOAD_DIR) / file_id + shutil.copy2(temp_path, file_storage_path) # Change the IDs to match the ones from the client 
modified_documents = [] diff --git a/tests/test_live_retriever_api.py b/tests/test_live_retriever_api.py index 1db80d6d..f779caf0 100644 --- a/tests/test_live_retriever_api.py +++ b/tests/test_live_retriever_api.py @@ -406,6 +406,32 @@ def test_upload_duplicate_file_returns_400(indexer_client): assert "already exists" in response.json()["detail"] +def test_upload_failed_processing_does_not_consume_id(indexer_client): + tc, upload_dir, _ = indexer_client + file_id = "id" + + response = tc.post( + "/v1/files", + data={"fileId": file_id}, + files={"file": ("file.xyz", b"bad", "application/octet-stream")}, + ) + assert response.status_code == 422 + assert not Path(upload_dir, file_id).exists() + + fake_path = str(Path(upload_dir) / "good.txt") + with patch( + "mmore.run_index_api.process_files_default", + return_value=[_fake_doc(fake_path, file_id)], + ): + response = tc.post( + "/v1/files", + data={"fileId": file_id}, + files={"file": ("file.txt", b"good", "text/plain")}, + ) + assert response.status_code == 201 + assert Path(upload_dir, file_id).read_bytes() == b"good" + + # --------------------------------------------------------------------------- # POST /v1/files/bulk # --------------------------------------------------------------------------- @@ -450,6 +476,40 @@ def test_upload_bulk_mismatched_ids_returns_400(indexer_client): assert "doesn't match" in response.json()["detail"] +def test_upload_bulk_failed_processing_does_not_consume_ids(indexer_client): + tc, upload_dir, _ = indexer_client + ids = ["id-1", "id-2"] + + response = tc.post( + "/v1/files/bulk", + data={"listIds": ",".join(ids)}, + files=[ + ("files", ("file.xyz", b"bad A", "application/octet-stream")), + ("files", ("file.xyz", b"bad B", "application/octet-stream")), + ], + ) + assert response.status_code == 422 + for file_id in ids: + assert not Path(upload_dir, file_id).exists() + + fake_paths = [str(Path(upload_dir) / f"{i}_file.txt") for i in ids] + with patch( + 
"mmore.run_index_api.process_files_default", + return_value=[_fake_doc(p, i) for p, i in zip(fake_paths, ids)], + ): + response = tc.post( + "/v1/files/bulk", + data={"listIds": ",".join(ids)}, + files=[ + ("files", ("file.txt", b"good A", "text/plain")), + ("files", ("file.txt", b"good B", "text/plain")), + ], + ) + assert response.status_code == 201 + assert Path(upload_dir, ids[0]).read_bytes() == b"good A" + assert Path(upload_dir, ids[1]).read_bytes() == b"good B" + + # --------------------------------------------------------------------------- # PUT /v1/files/{fileId} # --------------------------------------------------------------------------- From ca412b1753061aaf6133fb76112c9745c8610756 Mon Sep 17 00:00:00 2001 From: fabnemEPFL <117652591+fabnemEPFL@users.noreply.github.com> Date: Tue, 19 May 2026 17:53:20 +0200 Subject: [PATCH 20/24] Fix #288 (#307) Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> Co-authored-by: Copilot --- src/mmore/run_index_api.py | 106 ++++++++++++---- tests/test_live_retriever_api.py | 212 ++++++++++++++++++++++++++----- 2 files changed, 262 insertions(+), 56 deletions(-) diff --git a/src/mmore/run_index_api.py b/src/mmore/run_index_api.py index 972e2d8a..a6dacf76 100644 --- a/src/mmore/run_index_api.py +++ b/src/mmore/run_index_api.py @@ -4,7 +4,7 @@ import shutil import tempfile from pathlib import Path as FilePath -from typing import List, cast +from typing import List import uvicorn from fastapi import APIRouter, FastAPI, File, Form, HTTPException, Path, UploadFile @@ -23,6 +23,7 @@ from .process.processors import register_all_processors from .rag.retriever import RetrieverConfig +from .type import MultimodalSample from .utils import get_indexer, load_config, process_files_default UPLOAD_DIR: str = "./uploads" @@ -34,6 +35,18 @@ logger = logging.getLogger(__name__) +def _apply_uploaded_file_metadata( + documents: List[MultimodalSample], file_id: str, filename: str +) -> None: + """Bind 
processed chunks to the API file ID and persist the original filename.""" + for doc in documents: + chunk_id = doc.id.rsplit("+")[1] if "+" in doc.id else None + doc.document_id = file_id + doc.id = f"{file_id}+{chunk_id}" if chunk_id else file_id + + doc.metadata.extra["filename"] = filename + + def make_router(config_path: str) -> APIRouter: router = APIRouter() @@ -109,10 +122,13 @@ async def upload_file( os.makedirs(os.path.dirname(file_storage_path), exist_ok=True) shutil.copy2(temp_file_path, file_storage_path) - for doc in documents: - defDocId = doc.document_id - doc.document_id = fileId - doc.id = doc.id.replace(defDocId, fileId) + # Process and index the file + file_extension = FilePath(file.filename).suffix.lower() + documents = process_files_default( + temp_dir, COLLECTION_NAME, [file_extension] + ) + + _apply_uploaded_file_metadata(documents, fileId, file.filename) # Get indexer and index the document try: @@ -148,7 +164,12 @@ async def upload_files( Upload multiple files with custom IDs and index them. 
""" try: - listIds = listIds[0].split(",") + listIds = [ + file_id.strip() + for ids in listIds + for file_id in ids.split(",") + if file_id.strip() + ] # Check if IDs and files match in number if len(listIds) != len(files): raise HTTPException( @@ -159,13 +180,15 @@ async def upload_files( with tempfile.TemporaryDirectory() as temp_dir: logging.info(f"Starting to process {len(files)} files with custom IDs") - temp_paths: List[FilePath] = [] - for file, file_id in zip(files, listIds): + uploaded_files: list[dict[str, str]] = [] + file_info_by_temp_path = {} + for index, (file, file_id) in enumerate(zip(files, listIds)): if file.filename is None: raise HTTPException( status_code=422, detail=f"File {file_id} does not have a filename", ) + filename = file.filename # Check if file with this ID already exists file_storage_path = FilePath(UPLOAD_DIR) / file_id @@ -176,10 +199,19 @@ async def upload_files( ) # Save to temp directory - file_name = FilePath(temp_dir) / f"{file_id}_{file.filename}" - with file_name.open("wb") as buffer: + temp_file_path = ( + FilePath(temp_dir) / f"{index}{FilePath(filename).suffix}" + ) + file_info = { + "fileId": file_id, + "filename": filename, + "temp_path": str(temp_file_path.resolve()), + } + uploaded_files.append(file_info) + file_info_by_temp_path[file_info["temp_path"]] = file_info + + with temp_file_path.open("wb") as buffer: shutil.copyfileobj(file.file, buffer) - temp_paths.append(file_name) # Close the file await file.close() @@ -188,7 +220,8 @@ async def upload_files( # Process the documents file_extensions = [ - FilePath(cast(str, file.filename)).suffix.lower() for file in files + FilePath(file_info["temp_path"]).suffix.lower() + for file_info in uploaded_files ] try: documents = process_files_default( @@ -206,16 +239,34 @@ async def upload_files( ) from e # Save permanent copies - for temp_path, file_id in zip(temp_paths, listIds): - file_storage_path = FilePath(UPLOAD_DIR) / file_id - shutil.copy2(temp_path, 
file_storage_path) + for file_info in uploaded_files: + file_storage_path = FilePath(UPLOAD_DIR) / file_info["fileId"] + shutil.copy2(file_info["temp_path"], file_storage_path) # Change the IDs to match the ones from the client modified_documents = [] - for doc, docId in zip(documents, listIds): - defDocId = doc.document_id - doc.document_id = docId - doc.id = doc.id.replace(defDocId, docId) + text_by_file_id = {} + chunks_by_file_id = { + file_info["fileId"]: 0 for file_info in uploaded_files + } + for doc_index, doc in enumerate(documents): + doc_temp_path = str(FilePath(doc.metadata.file_path).resolve()) + file_info = file_info_by_temp_path.get(doc_temp_path) + if file_info is None: + if doc_index >= len(uploaded_files): + raise HTTPException( + status_code=500, + detail=( + "Could not match processed document " + f"{doc.metadata.file_path} to an uploaded file" + ), + ) + # Fallback for processors/tests that return file paths outside temp_dir. + file_info = uploaded_files[doc_index] + doc_id = file_info["fileId"] + _apply_uploaded_file_metadata([doc], doc_id, file_info["filename"]) + text_by_file_id.setdefault(doc_id, doc.text) + chunks_by_file_id[doc_id] += 1 modified_documents.append(doc) logging.info("Indexing the files") @@ -232,10 +283,16 @@ async def upload_files( return { "status": "success", - "message": f"Successfully processed and indexed {len(modified_documents)} documents", + "message": f"Successfully processed and indexed {len(uploaded_files)} files", "documents": [ - {"fileId": doc.document_id, "text": doc.text[:50] + "..."} - for doc in modified_documents + { + "fileId": file_info["fileId"], + "filename": file_info["filename"], + "text": text_by_file_id.get(file_info["fileId"], "")[:50] + + "...", + "chunks": chunks_by_file_id[file_info["fileId"]], + } + for file_info in uploaded_files ], } @@ -284,9 +341,8 @@ async def update_file( temp_dir, COLLECTION_NAME, [file_extension] ) - # Set the custom ID - for doc in documents: - doc.id = fileId + # 
Set the custom ID and preserve the original upload filename + _apply_uploaded_file_metadata(documents, fileId, file.filename) # Get indexer and reindex the document try: diff --git a/tests/test_live_retriever_api.py b/tests/test_live_retriever_api.py index f779caf0..f812c0ee 100644 --- a/tests/test_live_retriever_api.py +++ b/tests/test_live_retriever_api.py @@ -18,9 +18,14 @@ from mmore.index.indexer import Indexer from mmore.rag.model import DenseModelConfig, SparseModelConfig -from mmore.run_index_api import make_router as make_index_router +from mmore.run_index_api import ( + _apply_uploaded_file_metadata, +) +from mmore.run_index_api import ( + make_router as make_index_router, +) from mmore.run_retriever import make_router, save_results -from mmore.type import MultimodalSample +from mmore.type import DocumentMetadata, MultimodalSample _COLLECTION = "my_docs" @@ -216,13 +221,16 @@ def test_save_results_writes_valid_json(tmp_path): docs = [ Document( page_content="Paris is the capital.", - metadata={ - "rank": 1, - "similarity": 0.9, - "id": "1", - "page_numbers": [], - "paragraph_numbers": [], - }, + metadata=DocumentMetadata( + file_path="paris.txt", + extra={ + "rank": 1, + "similarity": 0.9, + "id": "1", + "page_numbers": [], + "paragraph_numbers": [], + }, + ).to_dict(), ) ] results = [docs] @@ -246,25 +254,31 @@ def test_save_results_multiple_queries(tmp_path): [ Document( page_content="doc A", - metadata={ - "rank": 1, - "similarity": 0.8, - "id": "a", - "page_numbers": [], - "paragraph_numbers": [], - }, + metadata=DocumentMetadata( + file_path="doc-a.txt", + extra={ + "rank": 1, + "similarity": 0.8, + "id": "a", + "page_numbers": [], + "paragraph_numbers": [], + }, + ).to_dict(), ) ], [ Document( page_content="doc B", - metadata={ - "rank": 1, - "similarity": 0.7, - "id": "b", - "page_numbers": [], - "paragraph_numbers": [], - }, + metadata=DocumentMetadata( + file_path="doc-b.txt", + extra={ + "rank": 1, + "similarity": 0.7, + "id": "b", + 
"page_numbers": [], + "paragraph_numbers": [], + }, + ).to_dict(), ) ], ] @@ -294,10 +308,21 @@ def _fake_doc(file_path: str, document_id: str = "doc") -> MultimodalSample: document_id=document_id, text="Test document content.", modalities=[], - metadata={"file_path": file_path}, + metadata=DocumentMetadata(file_path=file_path), ) +def test_apply_uploaded_file_metadata_preserves_chunk_suffix(): + doc = _fake_doc("/tmp/original-name.txt", document_id="default-doc") + doc.id = "processor-generated-id+7" + + _apply_uploaded_file_metadata([doc], "client-doc", "original-name.txt") + + assert doc.document_id == "client-doc" + assert doc.id == "client-doc+7" + assert doc.metadata.extra["filename"] == "original-name.txt" + + @pytest.fixture(scope="module") def indexer_client(tmp_path_factory): """Builds the indexer FastAPI app.""" @@ -392,6 +417,81 @@ def test_upload_file_success(indexer_client): assert Path(upload_dir, "new-doc").exists() +def test_uploaded_file_has_filename_in_list_files(tmp_path): + upload_dir = tmp_path / "uploads" + upload_dir.mkdir() + db_path = str(tmp_path / "uploaded_list_files.db") + config_file = tmp_path / "config.yaml" + cfg = { + "db": {"uri": db_path, "name": "my_db"}, + "hybrid_search_weight": 0.5, + "k": 2, + "collection_name": _COLLECTION, + "use_web": False, + "reranker_model_name": None, + } + with open(config_file, "w") as f: + yaml.dump(cfg, f) + + with ExitStack() as stack: + stack.enter_context( + patch( + "mmore.index.indexer.SparseModel.from_config", + return_value=FakeSparseEmbedding(), + ) + ) + milvus_client = MilvusClient(db_path, enable_sparse=True) + the_indexer = Indexer( + dense_model_config=DenseModelConfig(model_name="debug"), + sparse_model_config=SparseModelConfig( + model_name="naver/splade-cocondenser-selfdistil" + ), + client=milvus_client, + ) + stack.enter_context(patch("mmore.run_index_api.UPLOAD_DIR", str(upload_dir))) + stack.enter_context(patch("mmore.run_index_api.register_all_processors")) + 
stack.enter_context( + patch("mmore.run_index_api.get_indexer", return_value=the_indexer) + ) + + index_app = FastAPI() + index_app.include_router(make_index_router(str(config_file))) + index_client = TestClient(index_app, raise_server_exceptions=False) + + uploaded_path = str(upload_dir / "listed-doc.txt") + stack.enter_context( + patch( + "mmore.run_index_api.process_files_default", + return_value=[_fake_doc(uploaded_path)], + ) + ) + response = index_client.post( + "/v1/files", + data={"fileId": "listed-doc"}, + files={"file": ("listed-doc.txt", b"Hello list files", "text/plain")}, + ) + assert response.status_code == 201 + + stack.enter_context( + patch( + "mmore.rag.retriever.SparseModel.from_config", + return_value=FakeSparseEmbedding(), + ) + ) + retriever_app = FastAPI() + retriever_app.include_router(make_router(str(config_file))) + retriever_client = TestClient(retriever_app) + + response = retriever_client.get( + "/list_files", params={"collection_name": _COLLECTION} + ) + + assert response.status_code == 200 + files_by_id = {file["id"]: file["filename"] for file in response.json()} + assert files_by_id["listed-doc"] == "listed-doc.txt" + assert files_by_id["listed-doc"] != "Unknown" + + def test_upload_duplicate_file_returns_400(indexer_client): tc, upload_dir, _ = indexer_client duplicate_id = "duplicate-doc" @@ -438,16 +538,25 @@ def test_upload_failed_processing_does_not_consume_id(indexer_client): def test_upload_bulk_files_success(indexer_client): - tc, upload_dir, _ = indexer_client - fake_path_1 = str(Path(upload_dir) / "bulk-1.txt") - fake_path_2 = str(Path(upload_dir) / "bulk-2.txt") + tc, *_ = indexer_client + + def fake_process(temp_dir, collection_name, extensions): + first_path, second_path = sorted(Path(temp_dir).iterdir()) + return [ + _fake_doc(str(first_path), "bulk-1"), + MultimodalSample( + id="bulk-1+1", + document_id="bulk-1", + text="Second chunk from the first bulk document.", + modalities=[], + 
metadata=DocumentMetadata(file_path=str(first_path)), + ), + _fake_doc(str(second_path), "bulk-2"), + ] with patch( "mmore.run_index_api.process_files_default", - return_value=[ - _fake_doc(fake_path_1, "bulk-1"), - _fake_doc(fake_path_2, "bulk-2"), - ], + side_effect=fake_process, ): response = tc.post( "/v1/files/bulk", @@ -459,6 +568,47 @@ def test_upload_bulk_files_success(indexer_client): ) assert response.status_code == 201 + data = response.json() + documents_by_id = {doc["fileId"]: doc for doc in data["documents"]} + assert set(documents_by_id) == {"bulk-1", "bulk-2"} + assert documents_by_id["bulk-1"]["filename"] == "bulk-1.txt" + assert documents_by_id["bulk-1"]["chunks"] == 2 + assert documents_by_id["bulk-2"]["filename"] == "bulk-2.txt" + assert documents_by_id["bulk-2"]["chunks"] == 1 + + +def test_upload_bulk_files_allows_duplicate_uploaded_filenames(indexer_client): + tc, upload_dir, _ = indexer_client + + def fake_process(temp_dir, collection_name, extensions): + return [ + _fake_doc(str(path), f"processed-{path.stem}") + for path in sorted(Path(temp_dir).iterdir()) + ] + + with patch( + "mmore.run_index_api.process_files_default", + side_effect=fake_process, + ): + response = tc.post( + "/v1/files/bulk", + data={"listIds": "same-name-1,same-name-2"}, + files=[ + ("files", ("same.txt", b"First content", "text/plain")), + ("files", ("same.txt", b"Second content", "text/plain")), + ], + ) + + assert response.status_code == 201 + data = response.json() + documents_by_id = {doc["fileId"]: doc for doc in data["documents"]} + assert set(documents_by_id) == {"same-name-1", "same-name-2"} + assert documents_by_id["same-name-1"]["filename"] == "same.txt" + assert documents_by_id["same-name-1"]["chunks"] == 1 + assert documents_by_id["same-name-2"]["filename"] == "same.txt" + assert documents_by_id["same-name-2"]["chunks"] == 1 + assert Path(upload_dir, "same-name-1").exists() + assert Path(upload_dir, "same-name-2").exists() def 
test_upload_bulk_mismatched_ids_returns_400(indexer_client): From 48606246f814352a7264abd37835aa716e40daf9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9my=20Chaverot?= Date: Wed, 20 May 2026 22:18:10 +0200 Subject: [PATCH 21/24] Add workflow for Pyright type check (#300) Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> Co-authored-by: fabnemEPFL <117652591+fabnemEPFL@users.noreply.github.com> --- .github/workflows/pyright.yml | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 .github/workflows/pyright.yml diff --git a/.github/workflows/pyright.yml b/.github/workflows/pyright.yml new file mode 100644 index 00000000..a0a1902b --- /dev/null +++ b/.github/workflows/pyright.yml @@ -0,0 +1,34 @@ +name: 📐 Pyright type checks +on: + push: + branches: + - master + pull_request: + workflow_dispatch: + +jobs: + pyright: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + + - name: Set up Python 3.12 + uses: actions/setup-python@v6 + with: + python-version: "3.12" + + - name: Install uv and create venv + run: | + pipx install uv + uv venv .venv + + - name: Install dependencies + run: | + source .venv/bin/activate + uv pip install -e ".[process,index,rag,api,cpu,dev,websearch]" + + - name: Run Pyright + continue-on-error: true + run: | + source .venv/bin/activate + pyright From 9f4a0bf82432c1c37f17c58d5a12870bedbcaff7 Mon Sep 17 00:00:00 2001 From: Arthur PERRIN Date: Thu, 28 May 2026 16:45:59 +0200 Subject: [PATCH 22/24] address JCHAVEROT review comments on TUI - wire run_setup_wizard() in app.py dispatch (elif mode == "setup") - add uv to tui extra in pyproject.toml so setup wizard can run uv sync - replace .env file generation with export command hints (mmore does not use dotenv for secrets; exporting is simpler and safer) - remove ({e.name}) from cli.py TUI missing-dep error message - simplify README install snippet to uv sync --extra tui; remove implementation detail 
paragraph (already covered in for_devs.md) --- README.md | 4 +- pyproject.toml | 1 + src/mmore/cli.py | 4 +- src/mmore/tui/app.py | 4 ++ src/mmore/tui/setup.py | 86 ++++++++++++------------------------------ 5 files changed, 32 insertions(+), 67 deletions(-) diff --git a/README.md b/README.md index ecf3c5d6..be8cd84f 100644 --- a/README.md +++ b/README.md @@ -110,7 +110,7 @@ uv pip install "mmore[process,cpu]" Prefer a guided experience over editing YAML by hand? Install the `tui` extra and launch the interactive Terminal UI: ```bash -uv sync --extra tui --extra process --extra index --extra cpu +uv sync --extra tui mmore tui ``` @@ -121,8 +121,6 @@ From the launcher you can: - generate stage YAML configs through a guided wizard, - pick from existing example configs without leaving the terminal. -Generated configs land in `./tui-configs/` and are validated against the stage's dataclass before any run. Stages whose extras are missing are greyed out in the menu with the exact `uv sync --extra ...` command to enable them. Press `Ctrl-C` inside a sub-flow to cancel and return to the main menu; press it at the main menu to quit. - ### Minimal Example You can use our predefined CLI commands to execute parts of the pipeline. Note that you might need to prepend `python -m` to the command if the package does not properly create bash aliases. diff --git a/pyproject.toml b/pyproject.toml index 414ef7d8..2a22fe4b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -132,6 +132,7 @@ tui = [ # Interactive terminal launcher (`mmore tui`) "questionary>=2.0", "rich>=13", + "uv", ] all = [ diff --git a/src/mmore/cli.py b/src/mmore/cli.py index d6333a69..1030a465 100644 --- a/src/mmore/cli.py +++ b/src/mmore/cli.py @@ -272,9 +272,7 @@ def tui(): from .tui import run except ModuleNotFoundError as e: if e.name in ("questionary", "rich", "prompt_toolkit"): - click.echo( - f"TUI dependency missing ({e.name}). Install with: uv sync --extra tui" - ) + click.echo("TUI dependency missing. 
Install with: uv sync --extra tui") raise SystemExit(1) raise run() diff --git a/src/mmore/tui/app.py b/src/mmore/tui/app.py index 4f0f286c..0621f163 100644 --- a/src/mmore/tui/app.py +++ b/src/mmore/tui/app.py @@ -287,6 +287,10 @@ def run() -> None: _run_full_wizard() elif mode == "chat": _chat_only() + elif mode == "setup": + from mmore.tui.setup import run_setup_wizard + + run_setup_wizard() except (UserCancelledError, KeyboardInterrupt): console.print(f"[{ACCENT2}]cancelled — back to menu.[/]") continue diff --git a/src/mmore/tui/setup.py b/src/mmore/tui/setup.py index cffeb224..beea7848 100644 --- a/src/mmore/tui/setup.py +++ b/src/mmore/tui/setup.py @@ -1,11 +1,10 @@ -"""Setup wizard: install extras + generate .env in one guided flow.""" +"""Setup wizard: install extras + print export commands in one guided flow.""" from __future__ import annotations import os import subprocess import sys -from pathlib import Path import questionary from rich.panel import Panel @@ -185,54 +184,18 @@ def _collect_env_vars(stages: list[str]) -> dict[str, str]: return env_vars -def _write_dotenv(env_vars: dict[str, str], path: str = ".env") -> str: - """Write or merge env vars into a .env file. +def _print_export_commands(env_vars: dict[str, str]) -> None: + """Print export commands for the collected env vars. - Existing variables in the file are preserved; new ones are appended. + Displays a table with masked values, then prints the shell commands + the user can copy-paste into their shell or profile file. 
""" - existing: dict[str, str] = {} - lines: list[str] = [] - env_path = Path(path) - - if env_path.exists(): - raw = env_path.read_text() - for line in raw.splitlines(): - stripped = line.strip() - if stripped and not stripped.startswith("#") and "=" in stripped: - key = stripped.split("=", 1)[0].strip() - existing[key] = line - lines.append(line) - - # Append new vars - added = [] - for key, value in env_vars.items(): - if key in existing: - continue # don't overwrite existing - # Quote values that contain spaces - if " " in value: - entry = f'{key}="{value}"' - else: - entry = f"{key}={value}" - lines.append(entry) - added.append(key) - - if lines and not lines[-1].endswith("\n"): - content = "\n".join(lines) + "\n" - else: - content = "\n".join(lines) - - env_path.write_text(content) - return str(env_path) - - -def _preview_dotenv(env_vars: dict[str, str]) -> None: - """Show what will be written to .env.""" if not env_vars: - console.print(" [dim]No environment variables to write.[/dim]") + console.print(" [dim]No environment variables needed.[/dim]") return table = Table( - title="[bold].env preview[/bold]", + title="[bold]Environment variables[/bold]", title_style=ACCENT2, border_style=ACCENT, show_lines=False, @@ -241,7 +204,7 @@ def _preview_dotenv(env_vars: dict[str, str]) -> None: table.add_column("Value", style=MUTED) for key, value in env_vars.items(): - # Mask API keys + # Mask API keys and tokens if "KEY" in key or "TOKEN" in key: display = value[:4] + "…" + value[-4:] if len(value) > 8 else "****" else: @@ -249,16 +212,28 @@ def _preview_dotenv(env_vars: dict[str, str]) -> None: table.add_row(key, display) console.print(table) + console.print() + console.print( + Panel( + "\n".join( + f'export {k}="{v}"' if " " in v else f"export {k}={v}" + for k, v in env_vars.items() + ), + title="[bold]Add to your shell profile (e.g. 
~/.bashrc or ~/.zshrc)[/bold]", + border_style=ACCENT, + padding=(1, 2), + ) + ) def run_setup_wizard() -> None: - """Full setup wizard: pick stages → install deps → generate .env.""" + """Full setup wizard: pick stages → install deps → print export commands.""" console.print( Panel( Text( "This wizard will:\n" " 1. Install the right Python dependencies for your pipeline\n" - " 2. Generate a .env file with the required environment variables", + " 2. Show the environment variables you need to export", ), title="[bold]Setup wizard[/bold]", border_style=ACCENT2, @@ -289,26 +264,15 @@ def run_setup_wizard() -> None: if _confirm("Install dependencies now?", default=True): if not _install_deps(stages, compute): if not _confirm( - "Continue to .env setup despite install failure?", default=False + "Continue to env var setup despite install failure?", default=False ): return # Step 4: collect env vars env_vars = _collect_env_vars(stages) - # Step 5: preview and write .env - if env_vars: - _preview_dotenv(env_vars) - env_path = _prompt(".env file path", default=".env") - if _confirm(f"Write {len(env_vars)} variable(s) to {env_path}?", default=True): - written = _write_dotenv(env_vars, env_path) - console.print(f" [{OK}]✓[/] Saved to {written}") - else: - console.print(" [dim]Skipped .env generation.[/dim]") - else: - console.print( - " [dim]No environment variables needed for selected stages.[/dim]" - ) + # Step 5: print export commands + _print_export_commands(env_vars) console.print( f"\n [{OK}]✓ Setup complete![/] Run [bold]mmore tui[/bold] to start.\n" From 1a68bd88eecbe9ebd3b37fcba3ed84618cffc69d Mon Sep 17 00:00:00 2001 From: ArthurPerrin Date: Thu, 28 May 2026 18:51:18 +0200 Subject: [PATCH 23/24] Update setup choice text in app.py --- src/mmore/tui/app.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mmore/tui/app.py b/src/mmore/tui/app.py index 0621f163..f97fa526 100644 --- a/src/mmore/tui/app.py +++ b/src/mmore/tui/app.py @@ -253,7 +253,7 @@ 
def _main_menu() -> str | None: chat_choice, questionary.Separator(), questionary.Choice( - "🔧 Setup (install deps + generate .env)", value="setup" + "🔧 Setup (install dependencies)", value="setup" ), questionary.Choice("✕ Quit", value="quit"), ], From 25fcd5aa0e01bf19056d9cccb8a352e418b9df77 Mon Sep 17 00:00:00 2001 From: JCHAVEROT Date: Thu, 28 May 2026 19:01:46 +0200 Subject: [PATCH 24/24] chores: fix linter --- src/mmore/tui/app.py | 4 +--- uv.lock | 29 +++++++++++++++++++++++++++++ 2 files changed, 30 insertions(+), 3 deletions(-) diff --git a/src/mmore/tui/app.py b/src/mmore/tui/app.py index f97fa526..37186654 100644 --- a/src/mmore/tui/app.py +++ b/src/mmore/tui/app.py @@ -252,9 +252,7 @@ def _main_menu() -> str | None: wizard_choice, chat_choice, questionary.Separator(), - questionary.Choice( - "🔧 Setup (install dependencies)", value="setup" - ), + questionary.Choice("🔧 Setup (install dependencies)", value="setup"), questionary.Choice("✕ Quit", value="quit"), ], style=QSTYLE, diff --git a/uv.lock b/uv.lock index 8f887fea..1e1ff7b4 100644 --- a/uv.lock +++ b/uv.lock @@ -3703,6 +3703,7 @@ all = [ { name = "trafilatura" }, { name = "transformers" }, { name = "unidecode" }, + { name = "uv" }, { name = "uvicorn" }, { name = "xlrd" }, ] @@ -3806,6 +3807,7 @@ rag = [ tui = [ { name = "questionary" }, { name = "rich" }, + { name = "uv" }, ] websearch = [ { name = "ddgs" }, @@ -3902,6 +3904,7 @@ requires-dist = [ { name = "transformers", marker = "extra == 'process'", specifier = ">=4.44" }, { name = "typing-extensions", specifier = ">=4.15.0,<5.0" }, { name = "unidecode", marker = "extra == 'process'" }, + { name = "uv", marker = "extra == 'tui'" }, { name = "uvicorn", marker = "extra == 'api'", specifier = ">=0.29" }, { name = "validators", specifier = ">=0.28" }, { name = "xlrd", marker = "extra == 'process'", specifier = ">=2.0.1" }, @@ -9145,6 +9148,32 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/ed/d0/5bf7cbf1ac138c92b9ac21066d18faf4d7e7f651047b700eb192ca4b9fdb/uuid_utils-0.14.1-pp311-pypy311_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:258186964039a8e36db10810c1ece879d229b01331e09e9030bc5dcabe231bd2", size = 364700, upload-time = "2026-02-20T22:50:21.732Z" }, ] +[[package]] +name = "uv" +version = "0.11.16" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a1/99/025154611a4bd97a23851574c15d73bb71ada09d35f092d6972f9ac87f70/uv-0.11.16.tar.gz", hash = "sha256:4b435fcb0af8f34833dcc1903a8a223856437efd0d515c2160a2871def221238", size = 4177038, upload-time = "2026-05-21T22:10:01.009Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/55/e3/8b8cfc802bc476c67e31a39725538193265cf3a19585b4a60c232659f919/uv-0.11.16-py3-none-linux_armv6l.whl", hash = "sha256:c9e9d9cb73ee8cd2ad696dbf1bc3232abaac363270557684b6b85a2bdb8eb276", size = 23508087, upload-time = "2026-05-21T22:10:06.227Z" }, + { url = "https://files.pythonhosted.org/packages/45/78/d5ca91c636ac88e902b6b3ff31ad32d2d02663232d844aff871467a323d2/uv-0.11.16-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:01172238a75e42a5a55d12555cd9ec98bee24249f3645b98a4b32eb5f1ff5e43", size = 23028989, upload-time = "2026-05-21T22:09:50.127Z" }, + { url = "https://files.pythonhosted.org/packages/c7/26/c84580dfec5a87c36fb1218eac17c5194fa3e58e2a9232cf085d69eb6bed/uv-0.11.16-py3-none-macosx_11_0_arm64.whl", hash = "sha256:c75f9b5bac49b97131973910c220feac60fe47b10a333941b237ff0ae4b36721", size = 21572023, upload-time = "2026-05-21T22:09:58.703Z" }, + { url = "https://files.pythonhosted.org/packages/84/68/ba2bdc64fea96ef8c9796a991f244541b65bb9d31c661b322cc724857a4e/uv-0.11.16-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.musllinux_1_1_aarch64.whl", hash = "sha256:a801484f4507b6c2133e557350f3143b61b8f8b61dddb01ff7b84a74cdfab1fb", size = 23289936, upload-time = "2026-05-21T22:10:15.423Z" }, + { url 
= "https://files.pythonhosted.org/packages/c9/81/74922f693d5804a77d009338ca8dc709eff871fb60d9f2c263dede8d77d1/uv-0.11.16-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.musllinux_1_1_armv7l.whl", hash = "sha256:eb538069e768b042cf870be700a210518ce628e36d99d9a83b85acaf484d7f6a", size = 23020906, upload-time = "2026-05-21T22:10:24.242Z" }, + { url = "https://files.pythonhosted.org/packages/60/81/cda8886f5df4dd28854a9b97bcc3ee6a7d1b5b5b23aaaccfbf1ed3e5e2bf/uv-0.11.16-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d7cdb23457a4d1bc76bf1016638ea1d1ada0e8e032f656168e933d4d17c47e72", size = 23004220, upload-time = "2026-05-21T22:10:32.847Z" }, + { url = "https://files.pythonhosted.org/packages/98/7c/65837e07de23f0a40ab860bc6601f7c022d4bcf4b97ca79b6c35a2e72e65/uv-0.11.16-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:451327388d59ac3041cbda474296f3ceeafac5b1f645476198e7b95f504fcfd5", size = 24319651, upload-time = "2026-05-21T22:10:21.492Z" }, + { url = "https://files.pythonhosted.org/packages/85/70/9d364542bf118433b60ed71422e47d2c8c470aca7d3aef0df9449a5f726a/uv-0.11.16-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e7992b8276149b3ffaf35ce9434702d3e16bae6ec393e99df209b870a7e19eb0", size = 25359517, upload-time = "2026-05-21T22:09:46.519Z" }, + { url = "https://files.pythonhosted.org/packages/99/b4/650896e8cff5a3289cee860c41fd9876da83ca628c5871f9a61d5fc75c72/uv-0.11.16-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:83a8db9b3314d900e7a240105afce43f806c9e04c59ea10a40bdbdca84c6d0c5", size = 24563421, upload-time = "2026-05-21T22:10:35.82Z" }, + { url = "https://files.pythonhosted.org/packages/b1/7d/184711a8c02466e1486d57efdc9394ce09cbf43ee2c5794da70bd25db3fb/uv-0.11.16-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9b10086165189c39c53142a0e2f34e0b8889ef681886f589ed17be45a1a774c7", size = 24676607, upload-time = "2026-05-21T22:10:39.784Z" }, + { url = 
"https://files.pythonhosted.org/packages/ee/3f/5b338df6505f77f73c20eae38cb29f57d14dba56dac835386e3dc6e2a5d6/uv-0.11.16-py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:cfe1f06fb8f135a735a961065d5ee90f99cccf41749fb1f964edb5b3c3dae19b", size = 23401615, upload-time = "2026-05-21T22:10:30.124Z" }, + { url = "https://files.pythonhosted.org/packages/b6/f9/54bbcbc77443dc76468f09a49cc9f4f92ca49b4159a011c6010d223de4ea/uv-0.11.16-py3-none-manylinux_2_31_riscv64.musllinux_1_1_riscv64.whl", hash = "sha256:2454f80d8b548fb2e246151578809b14ad4395b3f357d738bae1af11918e91af", size = 24104468, upload-time = "2026-05-21T22:09:53.323Z" }, + { url = "https://files.pythonhosted.org/packages/3e/0a/b5f105514fddea5110fe3947cd18a9f199ff93dbad78e5e5a08e1b5d0ea2/uv-0.11.16-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:4249d57a563165d368050680deeb722f9c0053a0dbf3244b11cca3e6d85a3c7d", size = 24164861, upload-time = "2026-05-21T22:10:09.458Z" }, + { url = "https://files.pythonhosted.org/packages/f6/01/15d4ca2be7257862b077a9077ac31ce81c419f35ef7994e76356a317716b/uv-0.11.16-py3-none-musllinux_1_1_i686.whl", hash = "sha256:374c30126483ce95675c5de49e54c2454ddedb01c17b8321417fe4eb9da83406", size = 23644919, upload-time = "2026-05-21T22:10:03.129Z" }, + { url = "https://files.pythonhosted.org/packages/49/bf/9de3e262e6ff93aec2e0a4c238857293fd2c616dd79f25bb440f126bf32c/uv-0.11.16-py3-none-musllinux_1_1_x86_64.whl", hash = "sha256:746edfc9d1d8cd03dd58739989f634d3580648048d09f81a9c68da74c4eb9d62", size = 24973746, upload-time = "2026-05-21T22:10:18.413Z" }, + { url = "https://files.pythonhosted.org/packages/f6/7d/f4126dce104f1b5d0b451ce3ca41c4db69b963c2e78c3465fcda6440de31/uv-0.11.16-py3-none-win32.whl", hash = "sha256:50299b20aab2d28c05ff27d781ce2af3f5af2102bc304dc07a4ad54b05e2af8a", size = 22400991, upload-time = "2026-05-21T22:10:27.119Z" }, + { url = 
"https://files.pythonhosted.org/packages/8f/38/99627cb995a03389b227ce4b12b08e770565d0aa7850cd0420973194a638/uv-0.11.16-py3-none-win_amd64.whl", hash = "sha256:e901aafa5007beffafe57bfa44e5e248d99fb5d97036a3718fd65cf9723c5cd3", size = 25067163, upload-time = "2026-05-21T22:10:12.317Z" }, + { url = "https://files.pythonhosted.org/packages/b6/68/3ed1c0bdfb4bec501e5cde73419b4f39c8a125ef905a85fc0f239f19eb9b/uv-0.11.16-py3-none-win_arm64.whl", hash = "sha256:d777cb29661cdfa7f90dae77406c85fb5b729bf8bc13941dc237958a1ea1ba00", size = 23502015, upload-time = "2026-05-21T22:09:56.014Z" }, +] + [[package]] name = "uvicorn" version = "0.42.0"