fglogan
diff --git a/.github/workflows/build_cuda_all.yaml b/.github/workflows/build_cuda_all.yaml
Lines changed: 1 addition & 1 deletion
diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
Lines changed: 20 additions & 17 deletions
diff --git a/.gitignore b/.gitignore
Lines changed: 4 additions & 1 deletion
diff --git a/.typos.toml b/.typos.toml
Lines changed: 2 additions & 1 deletion
diff --git a/AGENTS.md b/AGENTS.md
Lines changed: 14 additions & 12 deletions
diff --git a/CLAUDE.md b/CLAUDE.md
Lines changed: 20 additions & 14 deletions
@@ -14,7 +14,7 @@ jobs:
     build-and-push-image:
         strategy:
           matrix:
-            compute_capability: [75, 80, 86, 89, 90]
+            compute_capability: [80, 86, 89, 90]
           fail-fast: false
         runs-on: ubuntu-latest
 
 
@@ -1,5 +1,5 @@
 name: docs
-#https://dev.to/deciduously/prepare-your-rust-api-docs-for-github-pages-2n5i
+
 on:
   push:
     branches: ["master"]
@@ -18,28 +18,30 @@ concurrency:
 jobs:
   deploy:
     runs-on: ubuntu-latest
-    strategy:
-      matrix:
-        rust: [stable]
     steps:
       - name: Checkout
         uses: actions/checkout@v4
-      - uses: actions-rs/toolchain@v1
+
+      - name: Setup Rust
+        uses: actions-rs/toolchain@v1
         with:
           profile: minimal
-          toolchain: ${{ matrix.rust }}
+          toolchain: stable
           override: true
+
+      - name: Setup mdbook
+        uses: peaceiris/actions-mdbook@v2
+        with:
+          mdbook-version: 'latest'
+
       - name: Setup Pages
         uses: actions/configure-pages@v5
-      - uses: actions-rs/cargo@v1
-        with:
-          command: doc
-          args: --no-deps
-      - name: Build docs
-        run: |
-          rm -rf ./docs
-          echo "<meta http-equiv=\"refresh\" content=\"0; url=mistralrs\">" > target/doc/index.html
-          cp -r target/doc ./docs
+
+      # Build mdbook (main documentation)
+      - name: Build mdbook
+        run: mdbook build docs
+
+      # Build Python docs
       - name: Build Python docs
         run: |
           python3 -m venv myenv
@@ -48,8 +50,9 @@ jobs:
           cd mistralrs-pyo3
           maturin develop
           cd ..
-          pdoc mistralrs -o ./docs/pyo3
+          pdoc mistralrs -o ./docs/book/pyo3
+
       - name: Deploy
         uses: JamesIves/github-pages-deploy-action@v4
         with:
-          folder: ./docs
+          folder: ./docs/book
@@ -5,4 +5,7 @@
 .DS_Store
 .idea
 mistral.rs/
-mistralrs-web-chat/cache
+mistralrs-web-chat/cache
+
+# mdbook output
+docs/book/
@@ -3,7 +3,8 @@ extend-exclude = [
     ".git/",
     "calibration_data/",
     "examples/server/phi3_duckduckgo_mistral.rs.ipynb",
-    "mistralrs-web-chat/static/"
+    "mistralrs-web-chat/static/",
+    "mistralrs-cli/static/"
 ]
 ignore-hidden = false
 
 
@@ -11,10 +11,10 @@ This file provides instructions for AI agents to understand the layout of the `m
 - `/mistralrs-quant/`     : Quantization support (ISQ, GGUF, GPTQ, AWQ, FP8, HQQ, etc.)
 - `/mistralrs-paged-attn/`: PagedAttention implementation
 - `/mistralrs-pyo3/`      : Python bindings (PyO3)
-- `/mistralrs-server/`    : CLI & OpenAI-compatible HTTP server (subcommands: run/vision-plain, diffusion, speech)
+- `/mistralrs-cli/`       : Unified CLI binary (commands: run, serve, bench, from-config)
 - `/mistralrs-server-core/`: Shared server core logic
-- `/mistralrs-web-chat/`  : Web chat application (static assets & backend integration)
-- `/mistralrs-bench/`     : Benchmarking tools
+- `/mistralrs-web-chat/`  : (Deprecated) Use `mistralrs serve --ui` instead
+- `/mistralrs-bench/`     : (Deprecated) Use `mistralrs bench` instead
 - `/docs/`                : Markdown documentation for models, features, and guides
 - `/examples/`            : Usage examples (Rust, Python, server samples, notebooks)
 - `/chat_templates/`      : Chat formatting templates (JSON/Jinja)
@@ -26,17 +26,17 @@ Mistral.rs supports multiple model types and advanced features via dedicated cra
 
 - **Text Inference**
   - Crate: `mistralrs-core` (low-level ops), `mistralrs` (API wrapper)
-  - CLI: `run` / `plain` subcommand in `mistralrs-server`
+  - CLI: `mistralrs run -m <model>` or `mistralrs serve -m <model>` (auto-detects model type)
   - Docs: `docs/SAMPLING.md`, `docs/TOOL_CALLING.md`
 - **Vision Models**
   - Crate: `mistralrs-vision`
-  - CLI: `vision-plain` subcommand
+  - CLI: `mistralrs run -m <model>` (auto-detects vision models)
   - Docs: `docs/VISION_MODELS.md`, `docs/IMAGEGEN_MODELS.md`, `docs/IMATRIX.md`
 - **Diffusion Models**
-  - CLI: `diffusion` subcommand
+  - CLI: `mistralrs run -m <model>` (auto-detects diffusion models)
   - Docs: `docs/FLUX.md`
 - **Speech Models**
-  - CLI: `speech` subcommand
+  - CLI: `mistralrs run -m <model>` (auto-detects speech models)
   - Docs: `docs/DIA.md`
 - **Quantization & ISQ**
   - Crate: `mistralrs-quant`
@@ -58,10 +58,10 @@ Mistral.rs supports multiple model types and advanced features via dedicated cra
    ```bash
    cargo build --workspace --release --features "<features>"
    ```
-4. Or build/install only the server binary:
+4. Or build/install only the CLI binary:
    ```bash
-   cargo build --release --package mistralrs-server --features "<features>"
-   cargo install --path mistralrs-server --features "<features>"
+   cargo build --release --package mistralrs-cli --features "<features>"
+   cargo install --path mistralrs-cli --features "<features>"
    ```
 
 ## Models
@@ -116,9 +116,11 @@ Avoid returning TODOs.
   ```bash
   python3 examples/python/<script>.py
   ```
-- Run server/CLI:
+- Run CLI:
   ```bash
-  ./target/release/mistralrs-server -i <mode> -m <model> [options]
+  mistralrs run -m <model>        # Interactive mode
+  mistralrs serve -p 1234 -m <model>  # Server mode
+  mistralrs bench -m <model>      # Benchmarking
   ```
 
 ## CI Parity
 
@@ -4,7 +4,7 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co
 
 ## Project Overview
 
-mistral.rs is a blazing-fast LLM inference engine written in Rust. It supports text, vision, image generation, and speech models with multiple APIs (Rust, Python, OpenAI HTTP, MCP).
+mistral.rs is a blazing-fast LLM inference engine written in Rust. It supports text, vision, image generation, and speech models with Rust and Python SDKs, plus OpenAI HTTP and MCP APIs.
 
 ## Essential Commands
 
@@ -19,8 +19,8 @@ cargo build --release --features "cuda flash-attn cudnn"
 # With Metal support (macOS)
 cargo build --release --features metal
 
-# Install server binary
-cargo install --path mistralrs-server --features <features>
+# Install CLI binary
+cargo install --path mistralrs-cli --features <features>
 ```
 
 ### Testing & Quality
@@ -40,14 +40,20 @@ cargo clippy --workspace --tests --examples -- -D warnings
 
 ### Running Models
 ```bash
-# Run interactive mode with plain model
-cargo run --release --features <features> -- -i plain -m <model_id> -a <arch>
+# Run interactive mode (model type auto-detected)
+mistralrs run -m <model_id>
 
 # Run with GGUF quantized model
-cargo run --release --features <features> -- -i gguf -f <file> -t <tokenizer>
+mistralrs run --format gguf -m <repo> -f <file>
 
 # Run server
-cargo run --release --features <features> -- --port 1234 <model_args>
+mistralrs serve -p 1234 -m <model_id>
+
+# Run server with web UI
+mistralrs serve --ui -m <model_id>
+
+# Run benchmarks
+mistralrs bench -m <model_id>
 ```
 
 ## Models
@@ -60,16 +66,16 @@ You should also look for a model.safetensors.index.json file for the model at ha
 
 ### Workspace Structure
 - `mistralrs-core/` - Core inference engine, model implementations, pipelines
-- `mistralrs-server/` - CLI binary entry point
+- `mistralrs-cli/` - Unified CLI binary (commands: run, serve, bench, from-config)
 - `mistralrs-server-core/` - HTTP server routing, OpenAI API implementation
-- `mistralrs-pyo3/` - Python bindings (PyO3)
-- `mistralrs/` - High-level Rust API
+- `mistralrs-pyo3/` - Python SDK (PyO3 bindings)
+- `mistralrs/` - Rust SDK (high-level crate)
 - `mistralrs-vision/` - Vision model support
 - `mistralrs-quant/` - Quantization implementations (ISQ, GGUF, GPTQ, etc.)
 - `mistralrs-paged-attn/` - PagedAttention implementation
 - `mistralrs-audio/` - Audio processing
 - `mistralrs-mcp/` - Model Context Protocol client
-- `mistralrs-bench/` - Benchmarking tools
+- `mistralrs-bench/` - (Deprecated) Use `mistralrs bench` instead
 
 ### Key Design Patterns
 
@@ -88,7 +94,7 @@ When adding new model architectures:
 2. Add pipeline support in `mistralrs-core/src/pipeline/`
 3. Update model detection in `mistralrs-core/src/pipeline/normal.rs`
 4. Add architecture enum variant in `mistralrs-core/src/lib.rs`
-5. Update CLI args in `mistralrs-server/src/main.rs`
+5. Update CLI args in `mistralrs-cli/src/main.rs`
 
 When adding new quantization methods:
 1. Implement in `mistralrs-quant/src/`
@@ -100,8 +106,8 @@ When adding new quantization methods:
 - `mistralrs-core/src/engine/mod.rs` - Main engine orchestration
 - `mistralrs-core/src/pipeline/mod.rs` - Pipeline trait and common logic
 - `mistralrs-server-core/src/routes.rs` - HTTP API endpoints
-- `mistralrs-pyo3/src/lib.rs` - Python API entry point
-- `mistralrs/examples/` - Usage examples for Rust API
+- `mistralrs-pyo3/src/lib.rs` - Python SDK entry point
+- `mistralrs/examples/` - Usage examples for Rust SDK
 
 ### Testing Approach
| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -3,7 +3,8 @@ extend-exclude = [` |
| 3 | 3 | `".git/",` |
| 4 | 4 | `"calibration_data/",` |
| 5 | 5 | `"examples/server/phi3_duckduckgo_mistral.rs.ipynb",` |
| 6 |  | `- "mistralrs-web-chat/static/"` |
|  | 6 | `+ "mistralrs-web-chat/static/",` |
|  | 7 | `+ "mistralrs-cli/static/"` |
| 7 | 8 | `]` |
| 8 | 9 | `ignore-hidden = false` |
| 9 | 10 |  |
`9`	`10`