diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..5430430 --- /dev/null +++ b/.env.example @@ -0,0 +1,22 @@ +# Milvus Vector Database +MILVUS_HOST=localhost +MILVUS_PORT=19530 +MILVUS_COLLECTION=docs_rag +MILVUS_VECTOR_FIELD=vector + +# LLM Configuration +# For local development with Ollama: +KSERVE_URL=http://localhost:11434/v1/chat/completions +MODEL=llama3.1:8b +# For production with KServe: +# KSERVE_URL=http://llama.docs-agent.svc.cluster.local/openai/v1/chat/completions +# MODEL=llama3.1-8B + +# Embedding Model +EMBEDDING_MODEL=sentence-transformers/all-mpnet-base-v2 + +# API Server +PORT=8000 + +# GitHub Token (optional - increases API rate limit) +# GITHUB_TOKEN=ghp_your_token_here diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..b161c7b --- /dev/null +++ b/Dockerfile @@ -0,0 +1,25 @@ +FROM python:3.11-slim + +# Create non-root user +RUN useradd -m -u 1000 appuser +WORKDIR /app + +# Install server dependencies +COPY server-https/requirements.txt ./requirements-server.txt +RUN pip install --no-cache-dir -r requirements-server.txt + +# Install ingestion dependencies +RUN pip install --no-cache-dir requests langchain-text-splitters + +# Environment variables +ENV PORT=8000 + +# Switch to non-root before running the app +USER appuser + +# Copy server and ingestion script +COPY server-https/app.py /app/ +COPY scripts/local_ingest.py /app/ + +EXPOSE 8000 +CMD ["python", "-u", "app.py"] diff --git a/README.md b/README.md index b387d19..a3ce5cd 100644 --- a/README.md +++ b/README.md @@ -9,11 +9,13 @@ The official LLM implementation of the Kubeflow Documentation Assistant powered - [Overview](#overview) - [Architecture](#architecture) - [Prerequisites](#prerequisites) -- [Installation](#installation) +- [Local Development Setup](#local-development-setup) +- [Installation (Kubernetes)](#installation) - [Milvus Vector Database](#milvus-vector-database) - [KServe Inference Service](#kserve-inference-service) - 
[Kubeflow Pipelines](#kubeflow-pipelines) - [API Server](#api-server) +- [Local vs Production](#local-vs-production) - [Usage](#usage) - [Configuration](#configuration) - [Troubleshooting](#troubleshooting) @@ -58,8 +60,97 @@ Kubeflow users often struggle to find relevant information across the extensive - GPU nodes (for LLM inference) - SSL certificate (for HTTPS API) +## Local Development Setup + +For local development without a Kubernetes cluster, you can use Docker Compose to run the full stack locally. + +### Prerequisites + +- [Docker Desktop](https://www.docker.com/products/docker-desktop/) +- [Ollama](https://ollama.ai/) (for local LLM inference) + +### 1. Install Ollama + +Ollama runs the Llama 3.1 model locally on your machine. + +**macOS:** +```bash +brew install ollama +``` + +**Linux:** +```bash +curl -fsSL https://ollama.com/install.sh | sh +``` + +Start the Ollama server and pull the model: +```bash +ollama serve +ollama pull llama3.1:8b +``` + +### 2. Clone and Configure + +```bash +git clone https://github.com/kubeflow/docs-agent.git +cd docs-agent +cp .env.example .env +``` + +### 3. Start All Services + +```bash +docker-compose up --build -d +``` + +This starts: +- **Milvus** — vector database (port `19530`) +- **etcd** — metadata storage for Milvus +- **MinIO** — object storage for Milvus +- **API Server** — FastAPI server (port `8000`) with hot reload + +> **Note:** The `--build` flag is only needed on the first run or after changing the Dockerfile/requirements. For subsequent runs, use `docker-compose up -d`. + +Verify all services are running: +```bash +docker-compose ps +``` + +### 4. Ingest Documentation + +Populate Milvus with Kubeflow documentation: +```bash +docker exec docs-agent-api python local_ingest.py +``` + +This is a one-time step. Re-run only when documentation is updated. + +### 5. 
Test + +```bash +curl -X POST http://localhost:8000/chat \ + -H "Content-Type: application/json" \ + -d '{"message": "What is Kubeflow?", "stream": false}' +``` + +### Docker Compose Commands + +| Command | Description | +|---------|-------------| +| `docker-compose up -d` | Start all services in background | +| `docker-compose up --build -d` | Rebuild and start (after Dockerfile/requirements change) | +| `docker-compose ps` | Check service status | +| `docker-compose logs -f` | View logs | +| `docker-compose logs -f api-server` | View API server logs only | +| `docker-compose down` | Stop all services | +| `docker-compose down -v` | Stop and remove all data | + +--- + ## Installation +The following instructions are for deploying to a **Kubernetes cluster** in production. + ### Milvus Vector Database #### What is Milvus? @@ -536,6 +627,19 @@ if data.get('citations'): print(f"Sources: {data['citations']}") ``` +## Local vs Production + +| Component | Local Development | Production (Kubernetes) | +|-----------|------------------|------------------------| +| **Vector DB** | Docker Compose (Milvus standalone) | Helm chart on Kubernetes | +| **LLM** | Ollama (runs on CPU/Apple Silicon) | KServe + vLLM (NVIDIA GPU) | +| **Data Ingestion** | `scripts/local_ingest.py` | Kubeflow Pipelines | +| **API Server** | Docker Compose (`uvicorn` with hot reload) | Kubernetes Deployment | +| **Service Mesh** | Not needed | Istio (mTLS + RBAC) | +| **SSL** | Plain HTTP | HTTPS with certificates | + +Both setups expose the same API format, but answers can differ: the local stack uses a different inference runtime and `scripts/local_ingest.py` ingests at most 20 files by default. The local setup uses lightweight alternatives that don't require Kubernetes or GPUs. 
+ ## Configuration ### Environment Variables diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..094f60f --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,86 @@ +services: + etcd: + image: quay.io/coreos/etcd:v3.5.18 + container_name: milvus-etcd + environment: + - ETCD_AUTO_COMPACTION_MODE=revision + - ETCD_AUTO_COMPACTION_RETENTION=1000 + - ETCD_QUOTA_BACKEND_BYTES=4294967296 + - ETCD_SNAPSHOT_COUNT=50000 + volumes: + - etcd_data:/etcd + command: etcd -advertise-client-urls=http://127.0.0.1:2379 -listen-client-urls http://0.0.0.0:2379 --data-dir /etcd + healthcheck: + test: ["CMD", "etcdctl", "endpoint", "health"] + interval: 30s + timeout: 20s + retries: 3 + + minio: + image: minio/minio:RELEASE.2023-03-20T20-16-18Z + container_name: milvus-minio + environment: + MINIO_ACCESS_KEY: minioadmin + MINIO_SECRET_KEY: minioadmin + ports: + - "9001:9001" + volumes: + - minio_data:/minio_data + command: minio server /minio_data --console-address ":9001" + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"] + interval: 30s + timeout: 20s + retries: 3 + + milvus: + image: milvusdb/milvus:v2.4.17 + container_name: milvus-standalone + environment: + ETCD_ENDPOINTS: etcd:2379 + MINIO_ADDRESS: minio:9000 + ports: + - "19530:19530" + - "9091:9091" + volumes: + - milvus_data:/var/lib/milvus + command: ["milvus", "run", "standalone"] + depends_on: + etcd: + condition: service_healthy + minio: + condition: service_healthy + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:9091/healthz"] + interval: 30s + timeout: 20s + retries: 3 + + api-server: + build: + context: . 
+ dockerfile: Dockerfile + container_name: docs-agent-api + ports: + - "8000:8000" + environment: + MILVUS_HOST: milvus + MILVUS_PORT: "19530" + MILVUS_COLLECTION: docs_rag + MILVUS_VECTOR_FIELD: vector + KSERVE_URL: http://host.docker.internal:11434/v1/chat/completions + MODEL: llama3.1:8b + EMBEDDING_MODEL: sentence-transformers/all-mpnet-base-v2 + PORT: "8000" + volumes: + - ./server-https/app.py:/app/app.py + - ./scripts/local_ingest.py:/app/local_ingest.py + command: ["python", "-u", "-m", "uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000", "--reload"] + depends_on: + milvus: + condition: service_healthy + +volumes: + etcd_data: + minio_data: + milvus_data: diff --git a/scripts/local_ingest.py b/scripts/local_ingest.py new file mode 100644 index 0000000..8c71e1e --- /dev/null +++ b/scripts/local_ingest.py @@ -0,0 +1,178 @@ +""" +Local data ingestion script for development. + +This replicates what the Kubeflow Pipeline does, but runs locally. +In production, use the KFP pipeline (pipelines/kubeflow-pipeline.py) instead. 
def connect_milvus():
    """Open the ``default`` pymilvus connection.

    Host and port are read from ``MILVUS_HOST`` / ``MILVUS_PORT`` and fall
    back to ``localhost`` and ``19530``.
    """
    host = os.getenv("MILVUS_HOST", "localhost")
    port = os.getenv("MILVUS_PORT", "19530")
    connections.connect("default", host=host, port=port)
    print(f"Connected to Milvus at {host}:{port}")


def create_collection(name, dim=768):
    """(Re)create the RAG collection and return it.

    WARNING: an existing collection with the same name is dropped first, so
    call this only once the replacement data is ready to be inserted.

    Args:
        name: Milvus collection name.
        dim: Dimension of the ``vector`` field. Must match the embedding
            model's output size; 768 fits the default
            ``sentence-transformers/all-mpnet-base-v2``.

    Returns:
        The newly created ``Collection``.
    """
    if utility.has_collection(name):
        Collection(name).drop()
        print(f"Dropped existing collection: {name}")

    fields = [
        FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
        FieldSchema(name="file_unique_id", dtype=DataType.VARCHAR, max_length=512),
        FieldSchema(name="repo_name", dtype=DataType.VARCHAR, max_length=256),
        FieldSchema(name="file_path", dtype=DataType.VARCHAR, max_length=512),
        FieldSchema(name="file_name", dtype=DataType.VARCHAR, max_length=256),
        FieldSchema(name="citation_url", dtype=DataType.VARCHAR, max_length=1024),
        FieldSchema(name="chunk_index", dtype=DataType.INT64),
        FieldSchema(name="content_text", dtype=DataType.VARCHAR, max_length=2000),
        FieldSchema(name="vector", dtype=DataType.FLOAT_VECTOR, dim=dim),
        FieldSchema(name="last_updated", dtype=DataType.INT64),
    ]
    schema = CollectionSchema(fields, "RAG collection for documentation")
    collection = Collection(name, schema)
    print(f"Created collection: {name}")
    return collection


def fetch_docs(repo_owner, repo_name, directory_path, token=None, max_files=20, timeout=30):
    """Download Markdown files from a GitHub repository directory.

    Walks ``directory_path`` recursively through the GitHub contents API and
    stops as soon as ``max_files`` files have been collected.

    Args:
        repo_owner: GitHub organisation or user.
        repo_name: Repository name.
        directory_path: Directory inside the repository to walk.
        token: Optional GitHub token; an empty value means unauthenticated.
        max_files: Upper bound on the number of files returned.
        timeout: Per-request timeout in seconds. Without one, ``requests``
            waits indefinitely on a stalled connection.

    Returns:
        A list of dicts with the keys ``path``, ``content`` and ``file_name``.

    Raises:
        requests.HTTPError: If a directory listing request fails. Failures on
            individual files are reported and skipped instead.
    """
    headers = {"Authorization": f"token {token}"} if token else {}
    files = []

    def fetch_recursive(path):
        if len(files) >= max_files:
            return
        api_url = f"https://api.github.com/repos/{repo_owner}/{repo_name}/contents/{path}"
        response = requests.get(api_url, headers=headers, timeout=timeout)
        response.raise_for_status()

        for item in response.json():
            if len(files) >= max_files:
                return
            if item["type"] == "dir":
                fetch_recursive(item["path"])
            elif item["type"] == "file" and item["name"].endswith(".md"):
                try:
                    file_resp = requests.get(item["url"], headers=headers, timeout=timeout)
                    file_resp.raise_for_status()
                    content = base64.b64decode(file_resp.json()["content"]).decode("utf-8")
                except (requests.RequestException, KeyError, TypeError, ValueError) as e:
                    # Best effort: one unreadable file (HTTP error, missing or
                    # undecodable payload) must not abort the whole ingestion.
                    print(f"  Error fetching {item['path']}: {e}")
                else:
                    files.append({
                        "path": item["path"],
                        "content": content,
                        "file_name": item["name"],
                    })
                    print(f"  Fetched: {item['path']}")

    fetch_recursive(directory_path)
    print(f"Fetched {len(files)} files from GitHub")
    return files


def clean_content(content):
    """Reduce a Markdown document to plain text suitable for embedding.

    Removes, in order: the leading front matter block, horizontal-rule lines,
    ``{{ ... }}`` shortcodes, HTML comments, HTML tags, Markdown link targets
    (the link text is kept) and bare URLs; then collapses all whitespace runs
    to a single space.

    Args:
        content: Raw Markdown text.

    Returns:
        The cleaned, single-line text.
    """
    # Front matter exists only at the very top of the file. Anchoring with \A
    # (and requiring the closing delimiter at a line start) keeps body text
    # between two later "---" rules from being deleted.
    content = re.sub(
        r"\A\s*[+-]{3,}.*?^[+-]{3,}\s*", "", content, flags=re.DOTALL | re.MULTILINE
    )
    # Remaining delimiter-only lines are horizontal rules / underlines: noise.
    content = re.sub(r"^[ \t]*[+-]{3,}[ \t]*$", "", content, flags=re.MULTILINE)
    content = re.sub(r"\{\{.*?\}\}", "", content, flags=re.DOTALL)
    # Comments go before generic tags: "<[^>]+>" stops at the first ">" and
    # would leave the tail of a comment that contains one.
    content = re.sub(r"<!--.*?-->", "", content, flags=re.DOTALL)
    content = re.sub(r"<[^>]+>", " ", content)
    # Unwrap links BEFORE stripping URLs; the URL pattern otherwise swallows
    # the closing ")" and leaves "[text](" behind.
    content = re.sub(r"\[([^\]]+)\]\([^\)]+\)", r"\1", content)
    content = re.sub(r"https?://[^\s]+", "", content)
    content = re.sub(r"\s+", " ", content)
    return content.strip()


def _citation_url(base_url, file_info, docs_root):
    """Build the public URL for one fetched file, capped at 1024 characters."""
    root = docs_root.strip("/") + "/" if docs_root else ""
    path = file_info["path"]
    # Outside the docs root there is no known mapping; fall back to the name.
    rel = path[len(root):] if root and path.startswith(root) else file_info["file_name"]
    if rel.endswith(".md"):
        rel = rel[:-3]
    # Section/bundle index pages are served at their directory URL.
    for index_name in ("_index", "index"):
        if rel == index_name or rel.endswith("/" + index_name):
            rel = rel[: -len(index_name)].rstrip("/")
            break
    url = f"{base_url.rstrip('/')}/{rel}" if rel else base_url
    return url[:1024]


def chunk_and_embed(files, model, repo_name, base_url, chunk_size=1000, chunk_overlap=100,
                    docs_root="content/en/docs"):
    """Clean, split and embed fetched files into Milvus-ready records.

    Args:
        files: Output of ``fetch_docs``.
        model: Embedding model exposing ``encode``.
        repo_name: Stored with every record and used in ``file_unique_id``.
        base_url: Public URL that corresponds to ``docs_root``.
        chunk_size: Maximum characters per chunk.
        chunk_overlap: Characters shared between consecutive chunks.
        docs_root: Repository directory that maps onto ``base_url``. The
            citation URL is derived from the file path relative to it, so
            equally named files in different sections get distinct URLs.
            NOTE(review): assumes the site mirrors the repository layout
            (Hugo convention); confirm against the production pipeline.

    Returns:
        A list of record dicts matching the schema of ``create_collection``.
    """
    # NOTE: clean_content() collapses newlines, so in practice only the ". "
    # and " " separators can match.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", ". ", " ", ""],
    )
    records = []
    timestamp = int(datetime.now().timestamp())

    for f in files:
        content = clean_content(f["content"])
        if len(content) < 50:
            print(f"  Skipping (too short): {f['file_name']}")
            continue

        chunks = splitter.split_text(content)
        citation_url = _citation_url(base_url, f, docs_root)
        # One batched encode per file instead of one model call per chunk.
        embeddings = model.encode(chunks) if chunks else []
        for idx, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
            records.append({
                "file_unique_id": f"{repo_name}:{f['path']}",
                "repo_name": repo_name,
                "file_path": f["path"],
                "file_name": f["file_name"],
                "citation_url": citation_url,
                "chunk_index": idx,
                # NOTE(review): this truncates by characters; if the VARCHAR
                # limit is counted in bytes, multi-byte text may still
                # overflow. Confirm against the Milvus schema docs.
                "content_text": chunk[:2000],
                "vector": embedding.tolist(),
                "last_updated": timestamp,
            })
        print(f"  Chunked: {f['file_name']} -> {len(chunks)} chunks")

    print(f"Total: {len(records)} vectors")
    return records


def store_vectors(collection, records):
    """Insert records in batches, then index and load the collection.

    Args:
        collection: Target Milvus collection.
        records: Record dicts as produced by ``chunk_and_embed``. An empty
            list is reported and ignored.
    """
    if not records:
        print("No records to store")
        return

    batch_size = 1000
    for start in range(0, len(records), batch_size):
        collection.insert(records[start : start + batch_size])

    # Flush before indexing so the index covers everything just inserted.
    collection.flush()
    collection.create_index(
        "vector",
        {"metric_type": "COSINE", "index_type": "IVF_FLAT", "params": {"nlist": 128}},
    )
    collection.load()
    print(f"Stored {len(records)} vectors in Milvus")


def main():
    """Run the local ingestion: fetch, embed, then replace the collection.

    The existing collection is dropped only after new records have been
    produced, so a failed model download or GitHub fetch leaves the
    previously ingested data in place.
    """
    collection_name = os.getenv("MILVUS_COLLECTION", "docs_rag")
    embedding_model = os.getenv("EMBEDDING_MODEL", "sentence-transformers/all-mpnet-base-v2")
    github_token = os.getenv("GITHUB_TOKEN", "")

    # Connect first so an unreachable Milvus fails fast, before the slow steps.
    connect_milvus()

    print("\nLoading embedding model...")
    model = SentenceTransformer(embedding_model)

    print("\nFetching docs from GitHub...")
    files = fetch_docs("kubeflow", "website", "content/en/docs", github_token)

    print("\nChunking and embedding...")
    records = chunk_and_embed(files, model, "website", "https://www.kubeflow.org/docs")
    if not records:
        print("No records to store; existing collection left untouched")
        return

    print("\nStoring in Milvus...")
    # Size the vector field from the embeddings actually produced so a
    # non-default EMBEDDING_MODEL does not clash with a fixed dimension.
    collection = create_collection(collection_name, dim=len(records[0]["vector"]))
    store_vectors(collection, records)

    print("\nDone!")
__name__ == "__main__": + main() diff --git a/server-https/Dockerfile b/server-https/Dockerfile index 2212413..499f104 100644 --- a/server-https/Dockerfile +++ b/server-https/Dockerfile @@ -1,4 +1,4 @@ - FROM python:3.11-slim +FROM python:3.11-slim # Create non-root user RUN useradd -m -u 1000 appuser