diff --git a/.gitignore b/.gitignore index a490b5de..eaf88e7e 100644 --- a/.gitignore +++ b/.gitignore @@ -114,6 +114,7 @@ venv.bak/ # Milvus DB db/ *.db +*.db.lock # Project files tmp/ diff --git a/README.md b/README.md index 84565f5e..be8cd84f 100644 --- a/README.md +++ b/README.md @@ -105,6 +105,22 @@ uv pip install "mmore[process,cpu]" > :warning: **Check the instructions for contributors directly at [`docs/for_devs.md`](./docs/for_devs.md)** +### Interactive TUI + +Prefer a guided experience over editing YAML by hand? Install the `tui` extra and launch the interactive Terminal UI: + +```bash +uv sync --extra tui +mmore tui +``` + +From the launcher you can: + +- run any stage (process / postprocess / index / rag / chat) interactively, +- chain the full pipeline (process → postprocess → index → chat), +- generate stage YAML configs through a guided wizard, +- pick from existing example configs without leaving the terminal. + ### Minimal Example You can use our predefined CLI commands to execute parts of the pipeline. Note that you might need to prepend `python -m` to the command if the package does not properly create bash aliases. 
diff --git a/docs/source/developer_documentation/for_devs.md b/docs/source/developer_documentation/for_devs.md index ecd179c4..5d9949ff 100644 --- a/docs/source/developer_documentation/for_devs.md +++ b/docs/source/developer_documentation/for_devs.md @@ -31,6 +31,7 @@ This guide will help you set up your development environment and contribute to t - [Writing tests](#writing-tests) - [🔀 Pull Request Process](#-pull-request-process) - [PR checklist](#pr-checklist) + - [🖥️ Interactive TUI](#️-interactive-tui) - [💡 Development tips](#-development-tips) - [Working with `uv`](#working-with-uv) - [❓ Questions](#-questions) @@ -256,6 +257,25 @@ def test_something_on_gpu(): - [ ] Examples are provided for new features - [ ] Commit messages are clear and descriptive +## 🖥️ Interactive TUI + +MMORE ships with a Terminal UI that wraps the CLI commands behind guided menus and config wizards. Useful for trying the pipeline without writing YAML by hand. + +Launch it from a project working directory: + +```bash +mmore tui +``` + +From the main menu you can: + +- **Run a single command** — pick any stage (`process`, `postprocess`, `index`, `retrieve`, `rag`, `ragcli`, `websearch`), then either select an existing YAML, generate one through a guided wizard, or type a path manually. Generated configs are written to `./tui-configs/` and validated against the stage's dataclass before running. +- **Run full pipeline** — chains `process → postprocess → index` using existing configs. +- **Build a full pipeline config (guided wizard)** — walks through the three stages in order, wiring the postprocess output JSONL into the index config automatically. +- **Chat with indexed documents** — shortcut to `ragcli`. + +Stages whose extras are missing are disabled in the menu with an install hint (e.g. `uv sync --extra rag --extra cpu`). Press `Ctrl-C` inside any sub-flow to cancel back to the main menu; press it again at the main menu to quit. 
def merged_results_path(output_path: str) -> str:
    """Return the location of the merged JSONL that `process` produces.

    Downstream tooling (the TUI, scripts) should call this instead of
    re-deriving the path, so there is one definition of where a `process`
    run leaves its final output.

    Args:
        output_path: The `output_path` directory taken from a process config.

    Returns:
        ``<output_path>/merged/merged_results.jsonl``, joined with
        ``os.path.join``.
    """
    merged_dir = os.path.join(output_path, "merged")
    return os.path.join(merged_dir, "merged_results.jsonl")
def _show_missing_extras(spec_name: str, hint: str) -> None:
    """Print a yellow panel explaining why a stage cannot run.

    Args:
        spec_name: Name of the stage that was requested.
        hint: Install hint to show under the headline.
    """
    # Text.assemble takes (text, style) pairs, so neither string is parsed
    # as Rich markup.
    headline = (f"Stage `{spec_name}` can't run.\n\n", "bold")
    advice = (hint, "yellow")
    message = Text.assemble(headline, advice)

    warning_panel = Panel(
        message,
        title="[bold yellow]missing dependencies[/]",
        border_style="yellow",
        padding=(1, 2),
    )
    console.print(warning_panel)
def _chat_only() -> None:
    """Shortcut flow: choose (or build) a config, then start `ragcli`."""
    chat_spec = REGISTRY["ragcli"]
    chosen_config = pick_or_build_config(chat_spec)

    console.print()
    header = section("RAG chat", Text(f"config: {chosen_config}", style=MUTED))
    console.print(header)

    chat_spec.run(config_file=chosen_config)
def _pipeline_hint() -> str | None:
    """Return a combined hint if any of process/postprocess/index is missing.

    Each stage in ``_PIPELINE_STAGES`` is probed with
    ``check_stage_available``.

    Returns:
        The non-empty hints joined with ``" | "``, or ``None`` when every
        pipeline stage is available.
    """
    # Iterate the module-level tuple rather than re-spelling the stage names,
    # so this check and the dataclass warm-up cannot drift apart.
    probes = (check_stage_available(REGISTRY[stage]) for stage in _PIPELINE_STAGES)
    missing = [hint for hint in probes if hint]
    return " | ".join(missing) if missing else None
def run() -> None:
    """Run the interactive launcher until the user quits.

    Clears the console, shows the banner, then loops: display the main menu,
    dispatch to the selected flow, and return to the menu afterwards.

    * Ctrl-C (or a ``None`` answer) at the main menu ends the loop.
    * ``UserCancelledError`` / ``KeyboardInterrupt`` raised inside a flow
      cancels that flow and returns to the menu.
    * Any other exception is reported and the user is asked whether to
      continue; declining (or Ctrl-C at that prompt) ends the loop.
    """
    # Local import: only needed on the error path below.
    from rich.markup import escape

    console.clear()
    show_banner("interactive launcher")
    while True:
        # Ctrl-C at the main menu itself quits; inside any sub-flow it
        # cancels and returns here.
        try:
            mode = _main_menu()
        except KeyboardInterrupt:
            console.print(f"\n[{ACCENT}]bye![/]")
            return
        if mode in (None, "quit"):
            console.print(f"[{ACCENT}]bye![/]")
            return

        try:
            if mode == "single":
                _run_single_command()
            elif mode == "pipeline":
                _warm_pipeline_dataclasses()
                run_full_pipeline()
            elif mode == "wizard":
                _run_full_wizard()
            elif mode == "chat":
                _chat_only()
            elif mode == "setup":
                # Imported lazily so the setup wizard is only loaded on demand.
                from mmore.tui.setup import run_setup_wizard

                run_setup_wizard()
        except (UserCancelledError, KeyboardInterrupt):
            console.print(f"[{ACCENT2}]cancelled — back to menu.[/]")
            continue
        except Exception as e:  # noqa: BLE001
            # Escape the message before embedding it in markup: exception
            # text frequently contains square brackets, which Rich would
            # otherwise parse as tags (dropping text, or raising MarkupError
            # from inside this handler).
            console.print(f"[bold red]error:[/] {escape(str(e))}")
            try:
                cont = questionary.confirm(
                    "Continue?", default=True, style=QSTYLE, qmark=QMARK
                ).ask()
            except KeyboardInterrupt:
                return
            if not cont:
                return
def check_stage_available(spec: "CommandSpec") -> Optional[str]:
    """Report whether every canary module of *spec* can be located.

    Modules are probed with ``importlib.util.find_spec``. A module counts as
    missing when the lookup returns ``None`` or raises ``ImportError`` /
    ``ValueError``.

    Args:
        spec: Stage description; only ``canary_imports`` and
            ``required_extras`` are read.

    Returns:
        ``None`` when nothing is missing, otherwise a message naming the
        missing modules followed by a ``uv sync --extra ...`` install command.
    """

    def _is_locatable(module_name: str) -> bool:
        try:
            located = importlib.util.find_spec(module_name)
        except (ImportError, ValueError):
            return False
        return located is not None

    unresolved = [name for name in spec.canary_imports if not _is_locatable(name)]
    if unresolved:
        extra_flags = " ".join(f"--extra {e}" for e in spec.required_extras)
        message = (
            f"Missing: {', '.join(unresolved)}. Install with: uv sync {extra_flags}"
        )
        # strip() removes the trailing space left when there are no extras.
        return message.strip()
    return None
def _dc_process():
    """Import and return the config dataclass for the `process` stage."""
    from mmore.run_process import ProcessInference as config_cls

    return config_cls


def _dc_postprocess():
    """Import and return the config dataclass for the `postprocess` stage."""
    from mmore.process.post_processor.pipeline import PPPipelineConfig as config_cls

    return config_cls


def _dc_index():
    """Import and return the config dataclass for the `index` stage."""
    from mmore.run_index import IndexConfig as config_cls

    return config_cls


def _dc_rag():
    """Import and return the config dataclass shared by the RAG stages."""
    from mmore.run_rag import RAGInferenceConfig as config_cls

    return config_cls
name="rag", + description="Run a one-shot RAG pipeline", + example_config="examples/rag/config.yaml", + run=_rag, + config_globs=[ + "examples/rag/**/*.yaml", + "examples/rag/**/*.yml", + ], + config_dataclass=_dc_rag, + required_extras=["rag", "cpu"], + canary_imports=["langchain", "pymilvus", "torch"], + ), + "ragcli": CommandSpec( + name="ragcli", + description="Interactive RAG chat", + example_config="examples/rag/config.yaml", + run=_ragcli, + config_globs=[ + "examples/rag/**/*.yaml", + "examples/rag/**/*.yml", + ], + config_dataclass=_dc_rag, + required_extras=["rag", "cpu"], + canary_imports=["langchain", "pymilvus", "torch"], + ), + "websearch": CommandSpec( + name="websearch", + description="Web search (+ optional RAG)", + example_config="examples/websearchRAG/config.yaml", + run=_websearch, + config_globs=[ + "examples/websearchRAG/**/*.yaml", + "examples/websearchRAG/**/*.yml", + ], + required_extras=["websearch"], + canary_imports=["ddgs"], + ), +} diff --git a/src/mmore/tui/config_builder.py b/src/mmore/tui/config_builder.py new file mode 100644 index 00000000..6c54c6a0 --- /dev/null +++ b/src/mmore/tui/config_builder.py @@ -0,0 +1,836 @@ +"""Generate YAML config files via guided prompts. + +Templates here mirror the example configs under `examples/`. The user is +asked only for the fields most likely to change between runs; everything else +falls back to the example defaults. The resulting dict is dumped to a YAML +file under `./tui-configs/`. 
def _parse_finite_float(raw: str) -> float:
    """Parse *raw* as a float, rejecting ``nan`` and ``inf``."""
    import math

    value = float(raw)
    if not math.isfinite(value):
        raise ValueError(f"non-finite value: {raw!r}")
    return value


def _prompt_number(question: str, default: Any, parse: Any, kind: str) -> Any:
    """Prompt until the answer parses; a blank answer yields *default*.

    Args:
        question: Prompt text.
        default: Value pre-filled in the prompt and returned for blank input.
        parse: Callable turning the raw string into a number; it must raise
            ``ValueError`` on bad input.
        kind: Human-readable description used in the retry notice.
    """
    while True:
        raw = _prompt(question, str(default)).strip()
        if not raw:
            return default
        try:
            return parse(raw)
        except ValueError:
            # Plain Text (not markup) so brackets typed by the user are safe.
            console.print(
                Text(f"{raw!r} is not {kind}; please try again.", style="yellow")
            )


def _prompt_int(question: str, default: int) -> int:
    """Ask for an integer.

    Blank input returns *default*. Anything that does not parse as an integer
    is reported and asked again instead of silently becoming the default.
    Cancelling the prompt propagates ``UserCancelledError`` from ``_prompt``.
    """
    return _prompt_number(question, default, int, "an integer")


def _prompt_float(question: str, default: float) -> float:
    """Ask for a finite float.

    Blank input returns *default*. Unparseable or non-finite answers
    (``nan``, ``inf``) are reported and asked again.
    """
    return _prompt_number(question, default, _parse_finite_float, "a finite number")
def _editor_command(path: str) -> list[str]:
    """Build the argv used to open *path* in the user's editor.

    ``$EDITOR`` is split with ``shlex.split`` so values carrying flags
    (``EDITOR="code -w"``) work. If it is unset, empty, whitespace-only or
    cannot be tokenised (unbalanced quotes), ``vi`` is used instead.
    """
    raw = os.environ.get("EDITOR", "")
    try:
        editor_argv = shlex.split(raw)
    except ValueError:
        editor_argv = []
    # An empty argv here would make the config file itself the program to
    # execute, so always fall back to a real editor.
    return [*(editor_argv or ["vi"]), path]


def _edit_config(path: str) -> None:
    """Open a config file in $EDITOR (falls back to vi).

    Blocks until the editor process exits; its exit status is ignored.
    """
    subprocess.call(_editor_command(path))
def build_postprocess_config() -> str:
    """Build a single-chunker postprocess config and save it as YAML.

    Asks for the chunker settings and the output JSONL path, then writes a
    config holding one ``chunker`` module.

    Returns:
        Path of the YAML file written by ``_save``.
    """
    # Reuse the shared chunker prompts instead of duplicating them here; the
    # helper asks the same two questions and returns the same two keys.
    chunker_args = _ask_module_args("chunker")
    output_path = _prompt(
        "Output JSONL path",
        cwd_default("outputs/postprocess/results.jsonl"),
    )

    cfg = {
        "previous_results": None,
        "pp_modules": [
            {"type": "chunker", "args": chunker_args},
        ],
        "output": {"output_path": output_path, "save_each_step": True},
    }
    return _save("postprocess", cfg)
def build_rag_config() -> str:
    """Wizard for `rag` / `retrieve` / `ragcli` configs.

    Prompts for the LLM, the vector DB, retrieval settings and the run mode,
    then saves the resulting mapping as YAML.

    Answers are normalised before writing:

    * the hybrid search weight is clamped to ``[0.0, 1.0]``, the range the
      prompt advertises;
    * a whitespace-only reranker name is treated as blank (``None``);
    * an API port outside ``1..65535`` falls back to ``8000``.

    Returns:
        Path of the YAML file written by ``_save``.
    """
    llm_name = _prompt("LLM name", "OpenMeditron/meditron3-8b")
    max_new_tokens = _prompt_int("Max new tokens", 1200)

    db_uri = _prompt(
        "DB URI (Milvus Lite file or server URL)", cwd_default("proc_demo.db")
    )
    db_name = _prompt("DB name", "my_db")
    collection = _prompt("Collection name", "my_docs")
    k = _prompt_int("Number of docs to retrieve (k)", 5)
    hybrid = _prompt_float("Hybrid search weight (0.0 dense — 1.0 sparse)", 0.5)
    # Keep the weight inside the documented interval.
    hybrid = min(1.0, max(0.0, hybrid))
    use_web = _confirm("Augment retrieval with web search?", default=False)
    reranker = _prompt(
        "Reranker model (blank to skip)", "BAAI/bge-reranker-base"
    ).strip()

    mode = _ask(
        questionary.select(
            "Run mode",
            choices=["local", "api"],
            default="local",
            style=QSTYLE,
            qmark=QMARK,
        )
    )

    cfg: dict[str, Any] = {
        "rag": {
            "llm": {"llm_name": llm_name, "max_new_tokens": max_new_tokens},
            "retriever": {
                "db": {"uri": db_uri, "name": db_name},
                "hybrid_search_weight": hybrid,
                "k": k,
                "collection_name": collection,
                "use_web": use_web,
                "reranker_model_name": reranker or None,
            },
            "system_prompt": (
                "Use the following context to answer the questions.\n\n"
                "Context:\n{context}"
            ),
        },
        "mode": mode,
    }
    if mode == "local":
        input_file = _prompt(
            "Queries JSONL path", resolve_example("examples/rag/queries.jsonl")
        )
        output_file = _prompt(
            "Output JSON path", cwd_default("outputs/rag/output.json")
        )
        cfg["mode_args"] = {"input_file": input_file, "output_file": output_file}
    else:
        port = _prompt_int("API port", 8000)
        if not 0 < port < 65536:
            port = 8000
        # NOTE(review): "0.0.0.0" listens on every interface — confirm that is
        # intended for generated configs.
        cfg["mode_args"] = {
            "endpoint": "/rag",
            "host": "0.0.0.0",
            "port": port,
        }
    return _save("rag", cfg)
def build_process_config_wizard() -> str:
    """Interactive process-config builder with per-processor selection.

    Besides the basic paths and flags, the user chooses which processors to
    enable, may override their batch sizes, and is offered an incremental
    resume when a merged results file already exists under the chosen
    output path.

    Returns:
        Path of the YAML file written by ``_save``.
    """
    data_path = _prompt(
        "Data path (folder with documents to process)", cwd_default("data")
    )
    output_path = _prompt(
        "Output path (where merged_results.jsonl will be written)",
        cwd_default("outputs/process"),
    )
    use_fast = _confirm("Use fast (lower-quality) processors?", default=False)
    distributed = _confirm("Use distributed processing (Dask)?", default=False)
    extract_images = _confirm("Extract images from documents?", default=True)

    all_names = [processor for processor, _ in _ALL_PROCESSORS]
    processor_picker = questionary.checkbox(
        "Select processors to enable",
        choices=[
            questionary.Choice(label, value=label, checked=True)
            for label in all_names
        ],
        style=QSTYLE,
        qmark=QMARK,
    )
    # An empty selection would make the pipeline a no-op, so treat it as "all".
    enabled = _ask(processor_picker) or all_names

    customize = _confirm("Customize batch sizes?", default=False)
    sizes: list[dict[str, int]] = []
    for name, default in _ALL_PROCESSORS:
        if name in enabled:
            batch_size = default
            if customize:
                batch_size = _prompt_int(f"Batch size for {name}", default)
            sizes.append({name: batch_size})

    processor_config = {}
    for name, settings in _PROCESSOR_DEFAULT_CONFIG.items():
        if name in enabled:
            processor_config[name] = settings

    # Incremental resume: offer to reuse an earlier run's merged output.
    from mmore.run_process import merged_results_path

    prev_path = merged_results_path(output_path)
    previous_results = None
    if os.path.exists(prev_path):
        resume = _confirm(
            f"Previous results found at {prev_path}. Resume (skip unchanged files)?",
            default=True,
        )
        if resume:
            previous_results = prev_path

    dispatcher_config = {
        "output_path": output_path,
        "use_fast_processors": use_fast,
        "distributed": distributed,
        "extract_images": extract_images,
        "scheduler_file": None,
        "process_batch_sizes": sizes,
        "processor_config": processor_config,
    }
    cfg = {
        "data_path": data_path,
        "google_drive_ids": [],
        "previous_results": previous_results,
        "dispatcher_config": dispatcher_config,
    }
    return _save("process", cfg)
+ """ + base = ["chunker", "ner", "translator", "metafuse"] + try: + from mmore.process.post_processor.filter import FILTER_TYPES + from mmore.process.post_processor.tagger import TAGGER_TYPES + except ImportError: + return base + return [*base, *TAGGER_TYPES, *FILTER_TYPES] + + +def _ask_module_args(pp_type: str) -> dict[str, Any]: + if pp_type == "chunker": + strategy = _ask( + questionary.select( + "Chunking strategy", + choices=["sentence", "token", "word", "semantic"], + default="sentence", + style=QSTYLE, + qmark=QMARK, + ) + ) + table_handling = _ask( + questionary.select( + "Table handling", + choices=["single_row", "multi_rows", "keep_whole", "none"], + default="single_row", + style=QSTYLE, + qmark=QMARK, + ) + ) + return { + "chunking_strategy": strategy, + "table_handling": table_handling, + } + if pp_type in {"ner", "translator", "metafuse"}: + if _confirm(f"Provide extra args for `{pp_type}` as YAML?", default=False): + raw = _prompt("YAML args (single line, e.g. {key: value})", "{}") + try: + parsed = yaml.safe_load(raw) or {} + if isinstance(parsed, dict): + return parsed + except yaml.YAMLError: + pass + return {} + return {} + + +def build_postprocess_config_wizard() -> str: + """Build a postprocess config with an arbitrary list of pp_modules.""" + available = _postprocessor_choices() + modules: list[dict[str, Any]] = [] + while True: + if modules: + console.print( + f" [dim]current modules:[/] {', '.join(m['type'] for m in modules)}" + ) + pp_type = _ask( + questionary.select( + "Add a post-processor module" if not modules else "Add another module", + choices=[*available, questionary.Separator(), "(done)"], + style=QSTYLE, + qmark=QMARK, + ) + ) + if pp_type == "(done)": + break + args = _ask_module_args(pp_type) + modules.append({"type": pp_type, "args": args}) + + output_path = _prompt( + "Output JSONL path", + cwd_default("outputs/postprocess/results.jsonl"), + ) + + # Incremental resume: detect previous results + previous_results = None + # 
Resolve the actual JSONL path (dir β†’ dir/final.jsonl, .jsonl β†’ as-is) + if output_path.endswith(".jsonl"): + pp_prev_path = output_path + else: + pp_prev_path = os.path.join(output_path, "final.jsonl") + if os.path.exists(pp_prev_path) and _confirm( + f"Previous results found at {pp_prev_path}. Resume (skip unchanged)?", + default=True, + ): + previous_results = pp_prev_path + + cfg = { + "previous_results": previous_results, + "pp_modules": modules, + "output": {"output_path": output_path, "save_each_step": True}, + } + return _save("postprocess", cfg) + + +def build_index_config_wizard(documents_path: Optional[str] = None) -> str: + dense = _prompt("Dense embedding model", "sentence-transformers/all-MiniLM-L6-v2") + sparse = _prompt("Sparse embedding model", "splade") + multimodal = _confirm("Multimodal embeddings?", default=False) + db_uri = _prompt( + "DB URI (Milvus Lite file or server URL)", cwd_default("proc_demo.db") + ) + db_name = _prompt("DB name", "my_db") + collection = _prompt("Collection name", "my_docs") + docs = documents_path or _prompt( + "Documents JSONL path", + cwd_default("outputs/postprocess/results.jsonl"), + ) + cfg = { + "indexer": { + "dense_model": {"model_name": dense, "is_multimodal": multimodal}, + "sparse_model": {"model_name": sparse, "is_multimodal": multimodal}, + "db": {"uri": db_uri, "name": db_name}, + }, + "collection_name": collection, + "documents_path": docs, + } + return _save("index", cfg) + + +def build_full_pipeline_wizard() -> dict[str, str]: + """Build process + postprocess + index configs in one flow. + + Wires the postprocess output JSONL into the index config's documents_path + so the three files form a coherent pipeline. Validates each YAML and + re-prompts on failure (the per-stage builders run again on retry). 
+ """ + from mmore.tui.commands import REGISTRY + from mmore.tui.pipeline import _postprocess_output_jsonl + + console.print(section("Pipeline wizard", Text("step 1/3 β€” process", style=ACCENT2))) + while True: + process_path = build_process_config_wizard() + err = _validate_with_spinner(process_path, REGISTRY["process"]) + if err is None: + break + _show_error_panel(process_path, err) + if not _confirm("Retry the process step?", default=True): + raise UserCancelledError("cancelled") + + console.print( + section("Pipeline wizard", Text("step 2/3 β€” postprocess", style=ACCENT2)) + ) + while True: + pp_path = build_postprocess_config_wizard() + err = _validate_with_spinner(pp_path, REGISTRY["postprocess"]) + if err is None: + break + _show_error_panel(pp_path, err) + if not _confirm("Retry the postprocess step?", default=True): + raise UserCancelledError("cancelled") + + try: + docs_jsonl = _postprocess_output_jsonl(pp_path) + except Exception: # noqa: BLE001 + docs_jsonl = None + + console.print(section("Pipeline wizard", Text("step 3/3 β€” index", style=ACCENT2))) + while True: + index_path = build_index_config_wizard(documents_path=docs_jsonl) + err = _validate_with_spinner(index_path, REGISTRY["index"]) + if err is None: + break + _show_error_panel(index_path, err) + if not _confirm("Retry the index step?", default=True): + raise UserCancelledError("cancelled") + + return {"process": process_path, "postprocess": pp_path, "index": index_path} + + +def find_yaml_configs(spec: CommandSpec) -> list[str]: + """Find candidate YAML configs scoped to this stage. + + Globs are evaluated against the resolved repo root (looked up by walking + up from CWD), so the TUI works from any working directory. Generated + configs in `./tui-configs/` (CWD-relative) are always included so users + keep access to configs they just built. 
+ """ + root = repo_root() or Path.cwd() + matches: list[str] = [] + for pattern in spec.config_globs: + for p in root.glob(pattern): + matches.append(str(p)) + generated = Path.cwd() / "tui-configs" + if generated.exists(): + for p in sorted(generated.glob(f"{spec.name}-*.yaml")): + matches.append(str(p)) + + seen: set[str] = set() + out: list[str] = [] + for m in matches: + if m not in seen: + seen.add(m) + out.append(m) + return out + + +def _validate_yaml(path: str, spec: CommandSpec) -> Optional[str]: + """Return None on success, an error message string on failure.""" + if spec.config_dataclass is None: + return None + try: + from mmore.utils import load_config + + dataclass_cls = spec.config_dataclass() + load_config(path, dataclass_cls) + return None + except Exception as e: # noqa: BLE001 + return f"{type(e).__name__}: {e}" + + +def _validate_with_spinner(path: str, spec: CommandSpec) -> Optional[str]: + """Same as _validate_yaml but shows a spinner β€” config dataclass imports + can take several seconds (heavy transitive imports), making the TUI look + frozen otherwise.""" + spinner = Spinner( + "dots", text=Text(f" Validating {spec.name} config…", style="cyan") + ) + result: dict[str, Optional[str]] = {} + with Live(spinner, console=console, refresh_per_second=12, transient=True): + result["err"] = _validate_yaml(path, spec) + return result["err"] + + +def _show_error_panel(path: str, err: str) -> None: + console.print( + Panel( + Text.assemble( + (f"{path}\n\n", "bold"), + (err, "red"), + ), + title="[bold red]invalid config[/]", + border_style="red", + padding=(1, 2), + ) + ) + + +def _ranked_choices(spec: CommandSpec, candidates: list[str]) -> list[Any]: + """Put `spec.example_config` first as β˜… recommended; rest under a separator.""" + choices: list[Any] = [] + rec_resolved: Optional[str] = None + if spec.example_config: + rec_resolved = resolve_example(spec.example_config) + rest = list(candidates) + if rec_resolved and rec_resolved in rest: + 
choices.append( + questionary.Choice(f"β˜… {rec_resolved} (recommended)", value=rec_resolved) + ) + rest.remove(rec_resolved) + elif rec_resolved and Path(rec_resolved).exists(): + choices.append( + questionary.Choice(f"β˜… {rec_resolved} (recommended)", value=rec_resolved) + ) + if rest: + if choices: + choices.append(questionary.Separator("── other configs ──")) + for c in rest: + choices.append(questionary.Choice(c, value=c)) + return choices + + +def pick_or_build_config( + spec: CommandSpec, documents_path: Optional[str] = None +) -> str: + """Ask the user to either pick an existing YAML or generate one. + + Validates the chosen YAML against the stage's dataclass and re-prompts + on failure rather than letting the run blow up later. + """ + while True: + choice = _ask( + questionary.select( + f"Config for `{spec.name}`?", + choices=[ + questionary.Choice("πŸ“‚ Pick existing YAML", value="pick"), + questionary.Choice("✨ Generate new YAML (guided)", value="build"), + questionary.Choice( + "✎ Edit an existing YAML in $EDITOR", value="edit" + ), + questionary.Choice("⌨ Type a path manually", value="manual"), + ], + style=QSTYLE, + qmark=QMARK, + ) + ) + + path: Optional[str] = None + + if choice in ("pick", "edit"): + candidates = find_yaml_configs(spec) + ranked = _ranked_choices(spec, candidates) + if not ranked: + questionary.print( + f"No YAML configs found for `{spec.name}`, " + "falling back to manual entry.", + style="fg:yellow", + ) + choice = "manual" + else: + picked = _ask( + questionary.select( + f"Select a config for `{spec.name}`", + choices=ranked, + style=QSTYLE, + qmark=QMARK, + ) + ) + path = picked + if choice == "edit": + _edit_config(path) + + if choice == "manual": + manual = _prompt("Path to YAML config") + manual = os.path.expandvars(os.path.expanduser(manual)) + if not os.path.exists(manual): + _show_error_panel(manual, "file not found") + continue + path = manual + + if choice == "build": + builder = BUILDERS.get(spec.name) + if builder 
is None: + questionary.print( + f"No guided builder for `{spec.name}` β€” pick an existing YAML.", + style="fg:yellow", + ) + continue + if spec.name == "index": + path = builder(documents_path=documents_path) # type: ignore[call-arg] + else: + path = builder() + + if path is None: + raise UserCancelledError("no config selected") + err = _validate_with_spinner(path, spec) + if err is None: + return _post_validation_menu(path, spec) + _show_error_panel(path, err) + if not _confirm("Try a different config?", default=True): + raise UserCancelledError("cancelled") diff --git a/src/mmore/tui/exceptions.py b/src/mmore/tui/exceptions.py new file mode 100644 index 00000000..eb310dae --- /dev/null +++ b/src/mmore/tui/exceptions.py @@ -0,0 +1,11 @@ +"""TUI-only exceptions.""" + +from __future__ import annotations + + +class UserCancelledError(Exception): + """Raised when the user cancels a sub-flow (Ctrl-C or Esc inside a prompt). + + Caught by the top-level menu loop so cancellation returns to the main menu + instead of exiting the whole TUI. + """ diff --git a/src/mmore/tui/inspector.py b/src/mmore/tui/inspector.py new file mode 100644 index 00000000..2d0dd033 --- /dev/null +++ b/src/mmore/tui/inspector.py @@ -0,0 +1,126 @@ +"""Lightweight JSONL inspector for TUI result previews. + +Streams the file line-by-line (no heavy imports like torch/transformers) +and prints a rich summary table + sample documents. 
+""" + +from __future__ import annotations + +import json +import os +from collections import Counter +from pathlib import Path +from typing import Any + +from rich.panel import Panel +from rich.table import Table +from rich.text import Text + +from mmore.tui.theme import ACCENT, ACCENT2, MUTED, console + + +def _iter_dicts(path: str): + """Yield raw dicts from a JSONL file without importing MultimodalSample.""" + with open(path) as f: + for line in f: + line = line.strip() + if line: + yield json.loads(line) + + +def inspect_jsonl(path: str, max_samples: int = 3) -> None: + """Print a summary of a JSONL file: counts, breakdowns, sample docs.""" + if not os.path.exists(path): + console.print(f" [dim]no output file at {path}[/dim]") + return + + total = 0 + processor_types: Counter[str] = Counter() + file_extensions: Counter[str] = Counter() + modality_types: Counter[str] = Counter() + total_text_len = 0 + samples: list[dict[str, Any]] = [] + + for doc in _iter_dicts(path): + total += 1 + + meta = doc.get("metadata", {}) + pt = meta.get("processor_type", "unknown") + processor_types[pt] += 1 + + fp = meta.get("file_path", "") + ext = Path(fp).suffix.lower() if fp else "(none)" + file_extensions[ext] += 1 + + text = doc.get("text", "") + if isinstance(text, str): + total_text_len += len(text) + + for mod in doc.get("modalities", []): + modality_types[mod.get("type", "unknown")] += 1 + + if len(samples) < max_samples: + samples.append(doc) + + if total == 0: + console.print(" [dim]empty JSONL (0 documents)[/dim]") + return + + # --- Stats table --- + table = Table( + title="[bold]Results summary[/bold]", + title_style=ACCENT2, + border_style=ACCENT, + header_style=f"bold {ACCENT}", + show_lines=False, + padding=(0, 2), + ) + table.add_column("Metric", style="bold") + table.add_column("Value") + + table.add_row("Total documents", str(total)) + table.add_row("Avg text length", f"{total_text_len // total:,} chars") + + if processor_types: + breakdown = ", ".join(f"{k}: 
{v}" for k, v in processor_types.most_common()) + table.add_row("Processor types", breakdown) + + if file_extensions: + breakdown = ", ".join(f"{k}: {v}" for k, v in file_extensions.most_common()) + table.add_row("File types", breakdown) + + if modality_types: + breakdown = ", ".join(f"{k}: {v}" for k, v in modality_types.most_common()) + table.add_row("Modalities", breakdown) + + console.print() + console.print(table) + + # --- Sample documents --- + if samples: + sample_text = Text() + for i, doc in enumerate(samples, 1): + meta = doc.get("metadata", {}) + fp = meta.get("file_path", "?") + pt = meta.get("processor_type", "?") + text = doc.get("text", "") + if isinstance(text, str): + preview = text[:200].replace("\n", " ") + if len(text) > 200: + preview += "…" + else: + preview = "(structured content)" + sample_text.append(f"#{i} ", style="bold") + sample_text.append(f"{fp} ") + sample_text.append(f"({pt})", style="dim") + sample_text.append("\n") + sample_text.append(preview + "\n\n", style=MUTED) + + console.print( + Panel( + sample_text, + title=f"[bold]Sample documents (first {len(samples)})[/bold]", + border_style=ACCENT, + padding=(1, 2), + ) + ) diff --git a/src/mmore/tui/paths.py b/src/mmore/tui/paths.py new file mode 100644 index 00000000..cb2594b6 --- /dev/null +++ b/src/mmore/tui/paths.py @@ -0,0 +1,48 @@ +"""Locate bundled example configs regardless of CWD. + +Strategy: +- Walk up from CWD looking for a directory that contains ``examples/`` + (works from any subdirectory of a source checkout). +- If nothing is found, return the original repo-relative path so error + messages stay readable; callers handle "missing" gracefully. 
+""" + +from __future__ import annotations + +import os +from pathlib import Path +from typing import Optional + + +def repo_root() -> Optional[Path]: + """Return a directory that contains an `examples/` folder, if any.""" + cwd = Path.cwd() + for candidate in [cwd, *cwd.parents]: + if (candidate / "examples").is_dir(): + return candidate + return None + + +def resolve_example(rel: str) -> str: + """Resolve an `examples/...` relative path to an absolute one. + + Falls back to the original string if no source checkout is found, so the + UI can still display it (and the validator will surface a clear error). + """ + root = repo_root() + if root is not None: + candidate = root / rel + if candidate.exists(): + return str(candidate) + return rel + + +def resolve_glob(pattern: str) -> tuple[Path, str]: + """Split a relative glob into (root, remaining-pattern) for Path.glob.""" + root = repo_root() or Path.cwd() + return root, pattern + + +def cwd_default(rel: str) -> str: + """A safe default path rooted at CWD (e.g. `./data` instead of `examples/...`).""" + return os.path.join(".", rel) diff --git a/src/mmore/tui/pipeline.py b/src/mmore/tui/pipeline.py new file mode 100644 index 00000000..025692fb --- /dev/null +++ b/src/mmore/tui/pipeline.py @@ -0,0 +1,174 @@ +"""Chain process -> postprocess -> index from the TUI.""" + +from __future__ import annotations + +import questionary +from rich.table import Table +from rich.text import Text + +from mmore.tui.commands import REGISTRY +from mmore.tui.config_builder import pick_or_build_config +from mmore.tui.inspector import inspect_jsonl +from mmore.tui.theme import ( + ACCENT, + ACCENT2, + MUTED, + console, + run_step, + section, + step_header, +) + + +def _process_output_jsonl(config_path: str) -> str: + """Resolve the JSONL path the `process` step writes to. + + Goes through `mmore.utils.load_config` so env-var expansion ($ROOT_OUT_DIR, + etc.) matches what the underlying command sees. 
+ """ + from mmore.run_process import ProcessInference, merged_results_path + from mmore.utils import load_config + + cfg: ProcessInference = load_config(config_path, ProcessInference) + return merged_results_path(cfg.dispatcher_config.output_path) + + +def _postprocess_output_jsonl(config_path: str) -> str: + """Resolve the JSONL path `postprocess` writes to. + + Mirrors `PPPipeline`'s use of `mmore.process.utils.jsonl_path`: if the + configured `output_path` is a directory, the pipeline writes to + `/final.jsonl`; if it already ends in `.jsonl`, it's used as-is. + """ + from mmore.process.post_processor.pipeline import PPPipelineConfig + from mmore.process.utils import jsonl_path + from mmore.utils import load_config + + cfg: PPPipelineConfig = load_config(config_path, PPPipelineConfig) + return jsonl_path(cfg.output.output_path) + + +def _summary_table(rows: list[tuple[str, str, float]]) -> Table: + table = Table( + title="[bold]Pipeline summary[/bold]", + title_style=ACCENT2, + border_style=ACCENT, + header_style=f"bold {ACCENT}", + show_lines=False, + ) + table.add_column("Step", style="bold") + table.add_column("Output", style=MUTED) + table.add_column("Duration", justify="right") + total = 0.0 + for name, out, dur in rows: + table.add_row(name, out, f"{dur:.1f}s") + total += dur + table.add_section() + table.add_row("[bold]Total[/bold]", "", f"[bold]{total:.1f}s[/bold]") + return table + + +def run_pipeline_with_configs(process_cfg: str, pp_cfg: str, index_cfg: str) -> None: + """Execute the three stages given already-built YAML paths.""" + console.print() + console.print( + section( + "Full pipeline", + Text("process β†’ postprocess β†’ index β†’ (optional) chat", style=ACCENT), + style=ACCENT2, + ) + ) + + rows: list[tuple[str, str, float]] = [] + + step_header(1, 3, "process") + elapsed = run_step( + "Crawling + extracting documents", + REGISTRY["process"].run, + config_file=process_cfg, + ) + process_jsonl = _process_output_jsonl(process_cfg) + 
rows.append(("process", process_jsonl, elapsed)) + inspect_jsonl(process_jsonl) + + step_header(2, 3, "postprocess") + elapsed = run_step( + "Chunking + cleaning", + REGISTRY["postprocess"].run, + config_file=pp_cfg, + input_data=process_jsonl, + ) + pp_jsonl = _postprocess_output_jsonl(pp_cfg) + rows.append(("postprocess", pp_jsonl, elapsed)) + inspect_jsonl(pp_jsonl) + + step_header(3, 3, "index") + elapsed = run_step( + "Embedding + indexing into Milvus", + REGISTRY["index"].run, + config_file=index_cfg, + documents_path=pp_jsonl, + ) + rows.append(("index", "(vector DB)", elapsed)) + + console.print() + console.print(_summary_table(rows)) + console.print() + + if questionary.confirm("Open the RAG chat now?", default=True).ask(): + rag_cfg = pick_or_build_config(REGISTRY["ragcli"]) + REGISTRY["ragcli"].run(config_file=rag_cfg) + + +def run_full_pipeline() -> None: + console.print() + console.print( + section( + "Full pipeline", + Text("process β†’ postprocess β†’ index β†’ (optional) chat", style=ACCENT), + style=ACCENT2, + ) + ) + + rows: list[tuple[str, str, float]] = [] + + step_header(1, 3, "process") + process_cfg = pick_or_build_config(REGISTRY["process"]) + elapsed = run_step( + "Crawling + extracting documents", + REGISTRY["process"].run, + config_file=process_cfg, + ) + process_jsonl = _process_output_jsonl(process_cfg) + rows.append(("process", process_jsonl, elapsed)) + inspect_jsonl(process_jsonl) + + step_header(2, 3, "postprocess") + pp_cfg = pick_or_build_config(REGISTRY["postprocess"]) + elapsed = run_step( + "Chunking + cleaning", + REGISTRY["postprocess"].run, + config_file=pp_cfg, + input_data=process_jsonl, + ) + pp_jsonl = _postprocess_output_jsonl(pp_cfg) + rows.append(("postprocess", pp_jsonl, elapsed)) + inspect_jsonl(pp_jsonl) + + step_header(3, 3, "index") + index_cfg = pick_or_build_config(REGISTRY["index"], documents_path=pp_jsonl) + elapsed = run_step( + "Embedding + indexing into Milvus", + REGISTRY["index"].run, + 
config_file=index_cfg, + documents_path=pp_jsonl, + ) + rows.append(("index", "(vector DB)", elapsed)) + + console.print() + console.print(_summary_table(rows)) + console.print() + + if questionary.confirm("Open the RAG chat now?", default=True).ask(): + rag_cfg = pick_or_build_config(REGISTRY["ragcli"]) + REGISTRY["ragcli"].run(config_file=rag_cfg) diff --git a/src/mmore/tui/setup.py b/src/mmore/tui/setup.py new file mode 100644 index 00000000..beea7848 --- /dev/null +++ b/src/mmore/tui/setup.py @@ -0,0 +1,279 @@ +"""Setup wizard: install extras + print export commands in one guided flow.""" + +from __future__ import annotations + +import os +import subprocess +import sys + +import questionary +from rich.panel import Panel +from rich.table import Table +from rich.text import Text + +from mmore.tui.commands import REGISTRY, check_stage_available +from mmore.tui.config_builder import _ask, _confirm, _prompt +from mmore.tui.theme import ACCENT, ACCENT2, MUTED, OK, QMARK, QSTYLE, console + +# --------------------------------------------------------------------------- +# Stage β†’ extras mapping +# --------------------------------------------------------------------------- + +_STAGE_EXTRAS: dict[str, list[str]] = { + "process": ["process"], + "postprocess": ["process"], + "index": ["index"], + "rag": ["rag"], + "ragcli": ["rag"], + "retrieve": ["rag", "api"], + "websearch": ["websearch"], +} + +_COMPUTE_EXTRAS = [ + ("cpu", "CPU-only (no CUDA)"), + ("cu126", "CUDA 12.6 (GPU)"), +] + +# --------------------------------------------------------------------------- +# Stage β†’ env vars that may be needed +# --------------------------------------------------------------------------- + +_STAGE_ENV_VARS: dict[str, list[tuple[str, str, str]]] = { + # (var_name, description, default_or_empty) + "process": [ + ("ROOT_OUT_DIR", "Root output directory for processed results", ""), + ("ROOT_IN_DIR", "Root input directory for source documents", ""), + ], + "rag": [ + 
("OPENAI_API_KEY", "OpenAI API key (for GPT models)", ""), + ("ANTHROPIC_API_KEY", "Anthropic API key (for Claude models)", ""), + ("MISTRAL_API_KEY", "Mistral API key", ""), + ("COHERE_API_KEY", "Cohere API key", ""), + ("HF_TOKEN", "HuggingFace token (for gated models)", ""), + ], + "websearch": [ + ("TAVILY_API_KEY", "Tavily API key (optional, DuckDuckGo used otherwise)", ""), + ], +} + +# Aliases: ragcli and retrieve share rag's env vars +_STAGE_ENV_VARS["ragcli"] = _STAGE_ENV_VARS["rag"] +_STAGE_ENV_VARS["retrieve"] = _STAGE_ENV_VARS["rag"] + +# Profiling env vars (always available) +_PROFILING_VARS: list[tuple[str, str, str]] = [ + ("MMORE_PROFILING_ENABLED", "Enable profiling", "false"), + ("MMORE_PROFILING_OUTPUT_DIR", "Profiling output directory", "./profiling_output"), +] + + +def _detect_installed_stages() -> dict[str, bool]: + """Check which stages have their deps installed.""" + return { + name: check_stage_available(spec) is None for name, spec in REGISTRY.items() + } + + +def _pick_stages() -> list[str]: + """Ask the user which pipeline stages they want to use.""" + installed = _detect_installed_stages() + choices = [] + for name, spec in REGISTRY.items(): + label = f"{name:<12} β€” {spec.description}" + if installed[name]: + label += " [dim](installed)[/dim]" + choices.append( + questionary.Choice(label, value=name, checked=not installed[name]) + ) + + selected = _ask( + questionary.checkbox( + "Which stages do you want to set up?", + choices=choices, + style=QSTYLE, + qmark=QMARK, + ) + ) + return selected + + +def _pick_compute() -> str: + """Ask the user which compute backend to use.""" + choices = [ + questionary.Choice(f"{name:<6} β€” {desc}", value=name) + for name, desc in _COMPUTE_EXTRAS + ] + return _ask( + questionary.select( + "Compute backend", + choices=choices, + style=QSTYLE, + qmark=QMARK, + ) + ) + + +def _build_uv_command(stages: list[str], compute: str) -> list[str]: + """Build the uv sync command from selected stages + 
compute.""" + extras: set[str] = {"tui"} # always include TUI + for stage in stages: + extras.update(_STAGE_EXTRAS.get(stage, [])) + extras.add(compute) + + cmd = [sys.executable, "-m", "uv", "sync"] + for extra in sorted(extras): + cmd.extend(["--extra", extra]) + return cmd + + +def _install_deps(stages: list[str], compute: str) -> bool: + """Run uv sync with the right extras. Returns True on success.""" + cmd = _build_uv_command(stages, compute) + display_cmd = " ".join(cmd[2:]) # skip python -m prefix for display + console.print(f"\n [bold]Running:[/] {display_cmd}\n") + + result = subprocess.run(cmd, cwd=os.getcwd()) + if result.returncode == 0: + console.print(f" [{OK}]βœ“[/] Dependencies installed successfully") + return True + console.print(" [bold red]βœ—[/] Installation failed β€” check output above") + return False + + +def _collect_env_vars(stages: list[str]) -> dict[str, str]: + """Prompt the user for env vars needed by their selected stages.""" + seen: set[str] = set() + env_vars: dict[str, str] = {} + + # Gather all relevant vars (deduplicated) + all_vars: list[tuple[str, str, str]] = [] + for stage in stages: + for var in _STAGE_ENV_VARS.get(stage, []): + if var[0] not in seen: + seen.add(var[0]) + all_vars.append(var) + + if not all_vars: + return env_vars + + console.print( + Panel( + "Set environment variables for your selected stages.\n" + "Leave blank to skip β€” you can always edit the .env file later.", + title="[bold]Environment variables[/bold]", + border_style=ACCENT, + padding=(1, 2), + ) + ) + + for var_name, description, default in all_vars: + # Check if already set in environment + current = os.environ.get(var_name, "") + hint = f" [dim](current: {current[:20]}…)[/dim]" if current else "" + value = _prompt(f"{var_name} β€” {description}{hint}", default=current or default) + if value: + env_vars[var_name] = value + + # Optionally add profiling vars + if _confirm("Configure profiling settings?", default=False): + for var_name, 
description, default in _PROFILING_VARS: + value = _prompt(f"{var_name} β€” {description}", default=default) + if value: + env_vars[var_name] = value + + return env_vars + + +def _print_export_commands(env_vars: dict[str, str]) -> None: + """Print export commands for the collected env vars. + + Displays a table with masked values, then prints the shell commands + the user can copy-paste into their shell or profile file. + """ + if not env_vars: + console.print(" [dim]No environment variables needed.[/dim]") + return + + table = Table( + title="[bold]Environment variables[/bold]", + title_style=ACCENT2, + border_style=ACCENT, + show_lines=False, + ) + table.add_column("Variable", style="bold") + table.add_column("Value", style=MUTED) + + for key, value in env_vars.items(): + # Mask API keys and tokens + if "KEY" in key or "TOKEN" in key: + display = value[:4] + "…" + value[-4:] if len(value) > 8 else "****" + else: + display = value + table.add_row(key, display) + + console.print(table) + console.print() + console.print( + Panel( + "\n".join( + f'export {k}="{v}"' if " " in v else f"export {k}={v}" + for k, v in env_vars.items() + ), + title="[bold]Add to your shell profile (e.g. ~/.bashrc or ~/.zshrc)[/bold]", + border_style=ACCENT, + padding=(1, 2), + ) + ) + + +def run_setup_wizard() -> None: + """Full setup wizard: pick stages β†’ install deps β†’ print export commands.""" + console.print( + Panel( + Text( + "This wizard will:\n" + " 1. Install the right Python dependencies for your pipeline\n" + " 2. 
Show the environment variables you need to export", + ), + title="[bold]Setup wizard[/bold]", + border_style=ACCENT2, + padding=(1, 2), + ) + ) + + # Step 1: pick stages + stages = _pick_stages() + if not stages: + console.print(" [dim]No stages selected β€” nothing to do.[/dim]") + return + + # Step 2: pick compute backend + compute = _pick_compute() + + # Step 3: show install command and confirm + cmd = _build_uv_command(stages, compute) + display_cmd = " ".join(cmd[2:]) + console.print( + Panel( + Text(display_cmd), + title="[bold]Install command[/bold]", + border_style=ACCENT, + padding=(0, 2), + ) + ) + if _confirm("Install dependencies now?", default=True): + if not _install_deps(stages, compute): + if not _confirm( + "Continue to env var setup despite install failure?", default=False + ): + return + + # Step 4: collect env vars + env_vars = _collect_env_vars(stages) + + # Step 5: print export commands + _print_export_commands(env_vars) + + console.print( + f"\n [{OK}]βœ“ Setup complete![/] Run [bold]mmore tui[/bold] to start.\n" + ) diff --git a/src/mmore/tui/theme.py b/src/mmore/tui/theme.py new file mode 100644 index 00000000..4a7aeb6d --- /dev/null +++ b/src/mmore/tui/theme.py @@ -0,0 +1,128 @@ +"""Shared visuals: banner, palette, panel helpers.""" + +from __future__ import annotations + +import time +from typing import Any, Callable + +from questionary import Style +from rich.align import Align +from rich.console import Console, Group +from rich.panel import Panel +from rich.text import Text + +console = Console() + +QSTYLE = Style( + [ + ("qmark", "fg:#5fd7ff bold"), + ("question", "bold"), + ("answer", "fg:#ff5fd7 bold"), + ("pointer", "fg:#5fd7ff bold"), + ("highlighted", "fg:#5fd7ff bold"), + ("selected", "fg:#ff5fd7"), + ("instruction", "fg:#808080 italic"), + ("disabled", "fg:#ffaf00 italic"), + ] +) +QMARK = "β–Έ" + +# Palette +ACCENT = "bright_cyan" +ACCENT2 = "magenta" +MUTED = "grey58" +OK = "bold green" +WARN = "yellow" +ERR = "bold red" + 
BANNER = r"""

 ███╗   ███╗███╗   ███╗ ██████╗ ██████╗ ███████╗
 ████╗ ████║████╗ ████║██╔═══██╗██╔══██╗██╔════╝
 ██╔████╔██║██╔████╔██║██║   ██║██████╔╝█████╗
 ██║╚██╔╝██║██║╚██╔╝██║██║   ██║██╔══██╗██╔══╝
 ██║ ╚═╝ ██║██║ ╚═╝ ██║╚██████╔╝██║  ██║███████╗
 ╚═╝     ╚═╝╚═╝     ╚═╝ ╚═════╝ ╚═╝  ╚═╝╚══════╝
"""


def _mmore_logo(text: str) -> Text:
    """Color the banner like the mmore GitHub logo.

    Strategy, per character:
    - The second `M` (columns 12:23 of every row) is rendered fully in yellow.
    - Elsewhere: outline characters (`╔╗╚╝═║`, etc.) are white and the
      filled `█` blocks are black, giving the letters a hollow look.

    Args:
        text: Multi-line banner art. Rows that are empty or whitespace-only
            are passed through unstyled.

    Returns:
        A ``Text`` containing every row of ``text``, each followed by a newline.
    """
    outline_chars = frozenset("╔╗╚╝═║╠╣╦╩╬┌┐└┘─│")
    # Column span occupied by the second `M` in each banner row.
    second_m = slice(12, 23)
    out = Text()

    # Defined once, outside the row loop, instead of being rebuilt per row.
    def _emit(segment: str) -> None:
        for ch in segment:
            if ch == "█":
                # explicit hex — terminal "black" often renders as dark grey
                out.append(ch, style="#000000")
            elif ch in outline_chars:
                out.append(ch, style="bold #ffffff")
            else:
                out.append(ch)

    for line in text.splitlines():
        if not line.strip():
            out.append(line + "\n")
            continue
        _emit(line[: second_m.start])
        out.append(line[second_m], style="bold yellow")
        _emit(line[second_m.stop :])
        out.append("\n")
    return out


def show_banner(subtitle: str = "interactive launcher") -> None:
    """Print the colored banner and a centered subtitle inside a panel.

    Args:
        subtitle: Text shown under the logo, in italic muted style.
    """
    body = Group(
        Align.center(_mmore_logo(BANNER)),
        Align.center(Text(subtitle, style=f"italic {MUTED}")),
    )
    console.print(
        Panel(
            body,
            border_style=ACCENT,
            padding=(0, 2),
        )
    )


def section(title: str, body: str | Text, style: str = ACCENT) -> Panel:
    """Build a titled panel around ``body``.

    Args:
        title: Panel title; rendered in bold.
        body: Panel content. A plain string is wrapped in ``Text`` so it is
            not parsed as console markup.
        style: Border style of the panel.

    Returns:
        The constructed ``Panel``; nothing is printed here.
    """
    return Panel(
        body if isinstance(body, Text) else Text(body),
        title=f"[bold]{title}[/bold]",
        border_style=style,
        padding=(1, 2),
    )


def run_step(label: str, fn: Callable[..., Any], **kwargs: Any) -> float:
    """Print a start line, call fn(**kwargs), print a timed done line.

    Heavy pipeline commands emit their own logs via logging/click which bypass
    rich.Console — a Live spinner would clash with them. Plain prints keep the
    output readable while still showing progress.

    Args:
        label: Human-readable step name shown in both lines.
        fn: Callable to execute; its return value is discarded.
        **kwargs: Keyword arguments forwarded to ``fn``.

    Returns:
        Elapsed time of the ``fn`` call in seconds.

    Any exception raised by ``fn`` propagates unchanged; in that case the
    done line is not printed.
    """
    # perf_counter is monotonic, so the measured duration cannot be skewed
    # (or turn negative) if the system clock is adjusted while fn runs.
    start = time.perf_counter()
    console.print(f" [{ACCENT}]▸[/] {label}…")
    fn(**kwargs)
    elapsed = time.perf_counter() - start
    console.print(f" [{OK}]✓[/] {label} [dim]({elapsed:.1f}s)[/dim]")
    return elapsed


def step_header(idx: int, total: int, name: str) -> None:
    """Print a blank line followed by a ``Step idx/total name`` rule line.

    Args:
        idx: Number of the current step.
        total: Total number of steps.
        name: Step name shown after the counter.
    """
    bar = "─" * 4
    console.print()
    console.print(
        f"[{ACCENT}]{bar}[/] [bold]Step {idx}/{total}[/bold] "
        f"[{ACCENT2}]{name}[/] [{ACCENT}]{bar}[/]"
    )
"api", "websearch", "tui"], marker = "extra == 'all'" }, { name = "motor", marker = "extra == 'api'", specifier = ">=3.5" }, { name = "moviepy", marker = "extra == 'process'", specifier = ">=2.0" }, { name = "nltk", marker = "extra == 'rag'", specifier = ">=3.9" }, @@ -3870,10 +3878,12 @@ requires-dist = [ { name = "python-dotenv", specifier = ">=1.0" }, { name = "python-pptx", marker = "extra == 'process'" }, { name = "pyyaml", specifier = ">=6.0" }, + { name = "questionary", marker = "extra == 'tui'", specifier = ">=2.0" }, { name = "ragas", marker = "extra == 'rag'", specifier = ">=0.2" }, { name = "rarfile", marker = "extra == 'process'", specifier = ">=4.1" }, { name = "requests", marker = "extra == 'api'", specifier = ">=2.31" }, { name = "requests", marker = "extra == 'process'", specifier = ">=2.31" }, + { name = "rich", marker = "extra == 'tui'", specifier = ">=13" }, { name = "ruff", marker = "extra == 'dev'", specifier = ">=0.4.0" }, { name = "scipy", marker = "extra == 'index'", specifier = ">=1.8" }, { name = "sentence-transformers", marker = "extra == 'index'" }, @@ -3894,11 +3904,12 @@ requires-dist = [ { name = "transformers", marker = "extra == 'process'", specifier = ">=4.44" }, { name = "typing-extensions", specifier = ">=4.15.0,<5.0" }, { name = "unidecode", marker = "extra == 'process'" }, + { name = "uv", marker = "extra == 'tui'" }, { name = "uvicorn", marker = "extra == 'api'", specifier = ">=0.29" }, { name = "validators", specifier = ">=0.28" }, { name = "xlrd", marker = "extra == 'process'", specifier = ">=2.0.1" }, ] -provides-extras = ["process", "index", "rag", "api", "all", "cpu", "cu126", "websearch", "dev"] +provides-extras = ["process", "index", "rag", "api", "tui", "all", "cpu", "cu126", "websearch", "dev"] [[package]] name = "motor" @@ -5896,6 +5907,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c1/1b/f7ea6cde25621cd9236541c66ff018f4268012a534ec31032bcb187dc5e7/proglog-0.1.12-py3-none-any.whl", hash = 
"sha256:ccaafce51e80a81c65dc907a460c07ccb8ec1f78dc660cfd8f9ec3a22f01b84c", size = 6337, upload-time = "2025-05-09T14:36:16.798Z" }, ] +[[package]] +name = "prompt-toolkit" +version = "3.0.52" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "wcwidth" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a1/96/06e01a7b38dce6fe1db213e061a4602dd6032a8a97ef6c1a862537732421/prompt_toolkit-3.0.52.tar.gz", hash = "sha256:28cde192929c8e7321de85de1ddbe736f1375148b02f2e17edd840042b1be855", size = 434198, upload-time = "2025-08-27T15:24:02.057Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/84/03/0d3ce49e2505ae70cf43bc5bb3033955d2fc9f932163e84dc0779cc47f48/prompt_toolkit-3.0.52-py3-none-any.whl", hash = "sha256:9aac639a3bbd33284347de5ad8d68ecc044b91a762dc39b7c21095fcd6a19955", size = 391431, upload-time = "2025-08-27T15:23:59.498Z" }, +] + [[package]] name = "propcache" version = "0.4.1" @@ -6874,6 +6897,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f1/12/de94a39c2ef588c7e6455cfbe7343d3b2dc9d6b6b2f40c4c6565744c873d/pyyaml-6.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:ebc55a14a21cb14062aa4162f906cd962b28e2e9ea38f9b4391244cd8de4ae0b", size = 149341, upload-time = "2025-09-25T21:32:56.828Z" }, ] +[[package]] +name = "questionary" +version = "2.1.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "prompt-toolkit" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/f6/45/eafb0bba0f9988f6a2520f9ca2df2c82ddfa8d67c95d6625452e97b204a5/questionary-2.1.1.tar.gz", hash = "sha256:3d7e980292bb0107abaa79c68dd3eee3c561b83a0f89ae482860b181c8bd412d", size = 25845, upload-time = "2025-08-28T19:00:20.851Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3c/26/1062c7ec1b053db9e499b4d2d5bc231743201b74051c973dadeac80a8f43/questionary-2.1.1-py3-none-any.whl", hash = "sha256:a51af13f345f1cdea62347589fbb6df3b290306ab8930713bfae4d475a7d4a59", size = 36753, 
upload-time = "2025-08-28T19:00:19.56Z" }, +] + [[package]] name = "ragas" version = "0.4.3" @@ -9113,6 +9148,32 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ed/d0/5bf7cbf1ac138c92b9ac21066d18faf4d7e7f651047b700eb192ca4b9fdb/uuid_utils-0.14.1-pp311-pypy311_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:258186964039a8e36db10810c1ece879d229b01331e09e9030bc5dcabe231bd2", size = 364700, upload-time = "2026-02-20T22:50:21.732Z" }, ] +[[package]] +name = "uv" +version = "0.11.16" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a1/99/025154611a4bd97a23851574c15d73bb71ada09d35f092d6972f9ac87f70/uv-0.11.16.tar.gz", hash = "sha256:4b435fcb0af8f34833dcc1903a8a223856437efd0d515c2160a2871def221238", size = 4177038, upload-time = "2026-05-21T22:10:01.009Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/55/e3/8b8cfc802bc476c67e31a39725538193265cf3a19585b4a60c232659f919/uv-0.11.16-py3-none-linux_armv6l.whl", hash = "sha256:c9e9d9cb73ee8cd2ad696dbf1bc3232abaac363270557684b6b85a2bdb8eb276", size = 23508087, upload-time = "2026-05-21T22:10:06.227Z" }, + { url = "https://files.pythonhosted.org/packages/45/78/d5ca91c636ac88e902b6b3ff31ad32d2d02663232d844aff871467a323d2/uv-0.11.16-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:01172238a75e42a5a55d12555cd9ec98bee24249f3645b98a4b32eb5f1ff5e43", size = 23028989, upload-time = "2026-05-21T22:09:50.127Z" }, + { url = "https://files.pythonhosted.org/packages/c7/26/c84580dfec5a87c36fb1218eac17c5194fa3e58e2a9232cf085d69eb6bed/uv-0.11.16-py3-none-macosx_11_0_arm64.whl", hash = "sha256:c75f9b5bac49b97131973910c220feac60fe47b10a333941b237ff0ae4b36721", size = 21572023, upload-time = "2026-05-21T22:09:58.703Z" }, + { url = "https://files.pythonhosted.org/packages/84/68/ba2bdc64fea96ef8c9796a991f244541b65bb9d31c661b322cc724857a4e/uv-0.11.16-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.musllinux_1_1_aarch64.whl", hash = 
"sha256:a801484f4507b6c2133e557350f3143b61b8f8b61dddb01ff7b84a74cdfab1fb", size = 23289936, upload-time = "2026-05-21T22:10:15.423Z" }, + { url = "https://files.pythonhosted.org/packages/c9/81/74922f693d5804a77d009338ca8dc709eff871fb60d9f2c263dede8d77d1/uv-0.11.16-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.musllinux_1_1_armv7l.whl", hash = "sha256:eb538069e768b042cf870be700a210518ce628e36d99d9a83b85acaf484d7f6a", size = 23020906, upload-time = "2026-05-21T22:10:24.242Z" }, + { url = "https://files.pythonhosted.org/packages/60/81/cda8886f5df4dd28854a9b97bcc3ee6a7d1b5b5b23aaaccfbf1ed3e5e2bf/uv-0.11.16-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d7cdb23457a4d1bc76bf1016638ea1d1ada0e8e032f656168e933d4d17c47e72", size = 23004220, upload-time = "2026-05-21T22:10:32.847Z" }, + { url = "https://files.pythonhosted.org/packages/98/7c/65837e07de23f0a40ab860bc6601f7c022d4bcf4b97ca79b6c35a2e72e65/uv-0.11.16-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:451327388d59ac3041cbda474296f3ceeafac5b1f645476198e7b95f504fcfd5", size = 24319651, upload-time = "2026-05-21T22:10:21.492Z" }, + { url = "https://files.pythonhosted.org/packages/85/70/9d364542bf118433b60ed71422e47d2c8c470aca7d3aef0df9449a5f726a/uv-0.11.16-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e7992b8276149b3ffaf35ce9434702d3e16bae6ec393e99df209b870a7e19eb0", size = 25359517, upload-time = "2026-05-21T22:09:46.519Z" }, + { url = "https://files.pythonhosted.org/packages/99/b4/650896e8cff5a3289cee860c41fd9876da83ca628c5871f9a61d5fc75c72/uv-0.11.16-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:83a8db9b3314d900e7a240105afce43f806c9e04c59ea10a40bdbdca84c6d0c5", size = 24563421, upload-time = "2026-05-21T22:10:35.82Z" }, + { url = "https://files.pythonhosted.org/packages/b1/7d/184711a8c02466e1486d57efdc9394ce09cbf43ee2c5794da70bd25db3fb/uv-0.11.16-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:9b10086165189c39c53142a0e2f34e0b8889ef681886f589ed17be45a1a774c7", size = 24676607, upload-time = "2026-05-21T22:10:39.784Z" }, + { url = "https://files.pythonhosted.org/packages/ee/3f/5b338df6505f77f73c20eae38cb29f57d14dba56dac835386e3dc6e2a5d6/uv-0.11.16-py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:cfe1f06fb8f135a735a961065d5ee90f99cccf41749fb1f964edb5b3c3dae19b", size = 23401615, upload-time = "2026-05-21T22:10:30.124Z" }, + { url = "https://files.pythonhosted.org/packages/b6/f9/54bbcbc77443dc76468f09a49cc9f4f92ca49b4159a011c6010d223de4ea/uv-0.11.16-py3-none-manylinux_2_31_riscv64.musllinux_1_1_riscv64.whl", hash = "sha256:2454f80d8b548fb2e246151578809b14ad4395b3f357d738bae1af11918e91af", size = 24104468, upload-time = "2026-05-21T22:09:53.323Z" }, + { url = "https://files.pythonhosted.org/packages/3e/0a/b5f105514fddea5110fe3947cd18a9f199ff93dbad78e5e5a08e1b5d0ea2/uv-0.11.16-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:4249d57a563165d368050680deeb722f9c0053a0dbf3244b11cca3e6d85a3c7d", size = 24164861, upload-time = "2026-05-21T22:10:09.458Z" }, + { url = "https://files.pythonhosted.org/packages/f6/01/15d4ca2be7257862b077a9077ac31ce81c419f35ef7994e76356a317716b/uv-0.11.16-py3-none-musllinux_1_1_i686.whl", hash = "sha256:374c30126483ce95675c5de49e54c2454ddedb01c17b8321417fe4eb9da83406", size = 23644919, upload-time = "2026-05-21T22:10:03.129Z" }, + { url = "https://files.pythonhosted.org/packages/49/bf/9de3e262e6ff93aec2e0a4c238857293fd2c616dd79f25bb440f126bf32c/uv-0.11.16-py3-none-musllinux_1_1_x86_64.whl", hash = "sha256:746edfc9d1d8cd03dd58739989f634d3580648048d09f81a9c68da74c4eb9d62", size = 24973746, upload-time = "2026-05-21T22:10:18.413Z" }, + { url = "https://files.pythonhosted.org/packages/f6/7d/f4126dce104f1b5d0b451ce3ca41c4db69b963c2e78c3465fcda6440de31/uv-0.11.16-py3-none-win32.whl", hash = "sha256:50299b20aab2d28c05ff27d781ce2af3f5af2102bc304dc07a4ad54b05e2af8a", size = 22400991, upload-time = "2026-05-21T22:10:27.119Z" }, + { 
url = "https://files.pythonhosted.org/packages/8f/38/99627cb995a03389b227ce4b12b08e770565d0aa7850cd0420973194a638/uv-0.11.16-py3-none-win_amd64.whl", hash = "sha256:e901aafa5007beffafe57bfa44e5e248d99fb5d97036a3718fd65cf9723c5cd3", size = 25067163, upload-time = "2026-05-21T22:10:12.317Z" }, + { url = "https://files.pythonhosted.org/packages/b6/68/3ed1c0bdfb4bec501e5cde73419b4f39c8a125ef905a85fc0f239f19eb9b/uv-0.11.16-py3-none-win_arm64.whl", hash = "sha256:d777cb29661cdfa7f90dae77406c85fb5b729bf8bc13941dc237958a1ea1ba00", size = 23502015, upload-time = "2026-05-21T22:09:56.014Z" }, +] + [[package]] name = "uvicorn" version = "0.42.0"