medevs
diff --git a/‎.github/workflows/deploy.yml‎
Lines changed: 7 additions & 0 deletions b/‎.github/workflows/deploy.yml‎
Lines changed: 7 additions & 0 deletions
diff --git a/‎backend/Dockerfile‎
Lines changed: 63 additions & 22 deletions b/‎backend/Dockerfile‎
Lines changed: 63 additions & 22 deletions
diff --git a/‎backend/app/config.py‎
Lines changed: 6 additions & 0 deletions b/‎backend/app/config.py‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎backend/app/routers/chat.py‎
Lines changed: 32 additions & 68 deletions b/‎backend/app/routers/chat.py‎
Lines changed: 32 additions & 68 deletions
@@ -87,6 +87,13 @@ jobs:
           echo "📋 Checking ollama-init logs..."
           docker compose -f docker-compose.yml -f docker-compose.homelab.yml --profile observability logs ollama-init || true
 
+          echo "🗄️ Running database migrations..."
+          # Give the backend a fixed 10s to start before migrating (this is a plain sleep, not a health check)
+          sleep 10
+          docker compose -f docker-compose.yml -f docker-compose.homelab.yml exec -T backend alembic upgrade head || {
+            echo "⚠️ Migration failed, backend may need restart"
+          }
+
           echo "🧹 Cleaning up old images..."
           docker image prune -f
 
 
@@ -1,65 +1,106 @@
 # =============================================================================
-# Backend Dockerfile - FastAPI Application (Optimized - No ML dependencies)
+# Backend Dockerfile - FastAPI Application
 # =============================================================================
-# Multi-stage build for minimal production image
-# Embeddings are now handled by Ollama, reducing image size significantly
+# Two-stage build; Docling is always installed in the image, USE_DOCLING only toggles its use at runtime
+# First build takes ~10-15 minutes, subsequent builds use cache (~2-3 min)
 # =============================================================================
 
 FROM python:3.11-slim-bookworm AS builder
 
 WORKDIR /app
 
-# Install only essential build dependencies
+# Install build dependencies (including those needed for Docling)
 RUN apt-get update && apt-get install -y --no-install-recommends \
     build-essential \
     gcc \
+    g++ \
     && rm -rf /var/lib/apt/lists/*
 
-# Copy requirements first for better layer caching
+# Copy requirements
 COPY requirements.txt .
 
-# Install Python dependencies with optimizations
-RUN pip install --no-cache-dir --user \
-    --prefer-binary \
-    -r requirements.txt
+# =============================================================================
+# Layer 1: Core dependencies (fast, ~2 min)
+# =============================================================================
+# NOTE: every requirement below is quoted on purpose. Shell-form RUN executes
+# under /bin/sh, where an unquoted `pkg>=1.0` is parsed as the word `pkg` plus
+# a redirect of stdout to a file named `=1.0`, silently dropping the version
+# constraint. Quoting also keeps `uvicorn[standard]` safe from glob expansion.
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install --user --prefer-binary \
+    "fastapi==0.115.0" \
+    "uvicorn[standard]==0.32.0" \
+    "sqlalchemy>=2.0.0" \
+    "asyncpg>=0.29.0" \
+    "alembic>=1.13.0" \
+    "chromadb>=0.5.0" \
+    "httpx>=0.27.0" \
+    "pydantic>=2.9.0" \
+    "pydantic-settings>=2.5.0" \
+    "loguru>=0.7.0" \
+    "python-multipart==0.0.12" \
+    "python-dotenv==1.0.1" \
+    "celery>=5.3.0" \
+    "redis>=5.0.0" \
+    "langchain>=0.3.0" \
+    "langchain-community>=0.3.0" \
+    "langchain-text-splitters>=0.3.0" \
+    "rank-bm25>=0.2.2" \
+    "pypdf>=5.0.0" \
+    "python-docx>=1.1.0" \
+    "markdown>=3.6" \
+    "aiofiles>=24.0.0" \
+    "psutil>=5.9.0" \
+    "slowapi>=0.1.9" \
+    "langfuse>=2.0.0" \
+    "psycopg2-binary>=2.9.9"
+
+# =============================================================================
+# Layer 2: Docling dependencies (slow, ~8-10 min, cached after first build)
+# =============================================================================
+# Requirements are quoted so /bin/sh does not treat `>=` as an output
+# redirect (which would install unpinned versions and write stray files).
+RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=cache,target=/root/.cache/huggingface \
+    pip install --user --prefer-binary \
+    "transformers>=4.47.0" \
+    "docling>=2.14.0" \
+    "docling-core>=2.4.0"
 
 # =============================================================================
-# Production stage - Minimal image
+# Production stage
 # =============================================================================
 FROM python:3.11-slim-bookworm
 
 WORKDIR /app
 
-# Install only runtime dependencies
+# Install runtime dependencies (Tesseract for Docling OCR, curl for health check)
 RUN apt-get update && apt-get install -y --no-install-recommends \
     curl \
-    && rm -rf /var/lib/apt/lists/* \
-    && rm -rf /var/cache/apt/*
+    tesseract-ocr \
+    tesseract-ocr-eng \
+    libgl1 \
+    libglib2.0-0 \
+    && rm -rf /var/lib/apt/lists/*
 
 # Copy Python dependencies from builder
 COPY --from=builder /root/.local /root/.local
-
-# Ensure scripts in .local are usable
 ENV PATH=/root/.local/bin:$PATH
 
-# Set Python environment variables for production
+# Set Python environment variables
 ENV PYTHONDONTWRITEBYTECODE=1 \
     PYTHONUNBUFFERED=1 \
-    PYTHONOPTIMIZE=1
+    HF_HOME=/app/data/huggingface \
+    TRANSFORMERS_CACHE=/app/data/huggingface
 
 # Copy application code
 COPY app/ ./app/
 
+# Copy Alembic for database migrations
+COPY alembic.ini ./
+COPY alembic/ ./alembic/
+
 # Create directories for data persistence
-RUN mkdir -p /app/data/chroma_db /app/data/documents /app/logs \
+RUN mkdir -p /app/data/chroma_db /app/data/documents /app/logs /app/data/huggingface \
     && chmod -R 755 /app/data /app/logs
 
-# Expose port
 EXPOSE 8000
 
-# Health check - reduced start period since no ML model loading
-HEALTHCHECK --interval=30s --timeout=10s --start-period=10s --retries=3 \
+HEALTHCHECK --interval=30s --timeout=10s --start-period=30s --retries=3 \
     CMD curl -f http://localhost:8000/health || exit 1
 
-# Run the application
 CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
@@ -42,6 +42,12 @@ class Settings(BaseSettings):
     chunk_size: int = Field(default=500)
     chunk_overlap: int = Field(default=50)
     top_k_results: int = Field(default=3)
+
+    # Document Processing Settings
+    use_docling: bool = Field(
+        default=False,
+        description="Use Docling for advanced document processing (slower but better quality)"
+    )
 
     # Embedding Model (Ollama embedding model)
     embedding_model: str = Field(default="nomic-embed-text")
 
@@ -17,59 +17,44 @@
 router = APIRouter(prefix="/chat", tags=["Chat"])
 
 
+from app.agent.core import Agent
+
 @router.post("", response_model=ChatResponse)
 @router.post("/", response_model=ChatResponse)
 @limiter.limit(get_chat_limit)
 async def chat(request: Request, chat_request: ChatRequest) -> ChatResponse:
     """
-    Send a message and get an AI response.
-    Uses RAG to retrieve relevant context from the knowledge base.
-
-    Args:
-        request: FastAPI Request object (for rate limiting)
-        chat_request: ChatRequest with message and optional history
-
-    Returns:
-        ChatResponse with AI response and sources
+    Send a message and get an AI response using Agentic RAG.
     """
     logger.info(f"Chat request: {chat_request.message[:100]}...")
     settings = get_settings()
-    obs = get_observability_service()
-
+    
     try:
-        rag = get_rag_service()
-
         # Convert history to dict format
         history = None
         if chat_request.history:
             history = [
                 {"role": msg.role, "content": msg.content}
                 for msg in chat_request.history
             ]
-
-        # Trace the LLM call with Langfuse
-        with obs.trace_llm_call(
-            name="chat",
-            input_text=chat_request.message,
-            model=settings.ollama_model,
-            metadata={"endpoint": "/chat", "has_history": bool(history)}
-        ) as ctx:
-            result = await rag.query(
-                question=chat_request.message,
-                history=history
-            )
-            ctx.set_output(result["response"])
-
-        logger.info(f"Chat response generated, sources: {result.get('sources', [])}")
-
+            
+        agent = Agent()
+        response = await agent.run(chat_request.message, history=history)
+        
+        # Agent.run() currently returns only the answer text (a string).
+        # rag.txt instructs the model to "cite sources using metadata", so any
+        # citations are embedded inline in that text rather than returned
+        # separately. ChatResponse still expects a `sources` list, so it is
+        # sent empty for now.
+        # TODO: have the Agent return structured sources, or extract them from the text.
+        
         return ChatResponse(
-            response=result["response"],
-            sources=result.get("sources", [])
+            response=response,
+            sources=[] # Sources are embedded in the text citation
         )
 
     except Exception as e:
         logger.error(f"Chat error: {e}")
-        obs.log_error(trace_id=None, error=e, context={"endpoint": "/chat"})
         raise HTTPException(
             status_code=500,
             detail="Failed to generate response"
@@ -80,25 +65,14 @@ async def chat(request: Request, chat_request: ChatRequest) -> ChatResponse:
 @limiter.limit(get_chat_stream_limit)
 async def chat_stream(request: Request, chat_request: ChatRequest):
     """
-    Send a message and get a streaming AI response.
-    Uses Server-Sent Events (SSE) for real-time streaming.
-
-    Args:
-        request: FastAPI Request object (for rate limiting)
-        chat_request: ChatRequest with message and optional history
-
-    Returns:
-        StreamingResponse with SSE events
+    Send a message and get a streaming AI response using Agentic RAG.
+    Note: Currently sends the full response as a single chunk after processing.
     """
     logger.info(f"Streaming chat request: {chat_request.message[:100]}...")
     settings = get_settings()
-    obs = get_observability_service()
 
     async def generate():
-        full_response = []
         try:
-            rag = get_rag_service()
-
             # Convert history to dict format
             history = None
             if chat_request.history:
@@ -107,34 +81,24 @@ async def generate():
                     for msg in chat_request.history
                 ]
 
-            # Start trace for streaming
-            with obs.trace_llm_call(
-                name="chat_stream",
-                input_text=chat_request.message,
-                model=settings.ollama_model,
-                metadata={"endpoint": "/chat/stream", "has_history": bool(history)}
-            ) as ctx:
-                async for chunk_data in rag.query_stream(
-                    question=chat_request.message,
-                    history=history
-                ):
-                    # Collect response for tracing
-                    if chunk_data.get("chunk"):
-                        full_response.append(chunk_data["chunk"])
-
-                    # Format as SSE event
-                    event_data = json.dumps(chunk_data)
-                    yield f"data: {event_data}\n\n"
-
-                # Set full response for trace
-                ctx.set_output("".join(full_response))
+            agent = Agent()
+            # Agent processing (plan -> execute -> merge -> generate)
+            response = await agent.run(chat_request.message, history=history)
+            
+            # Yield the full response as a single chunk
+            # In the future, we could stream tokens from Ollama in the final step of Agent
+            chunk_data = {
+                "chunk": response,
+                "done": False,
+                "sources": [] # Sources embedded in text
+            }
+            yield f"data: {json.dumps(chunk_data)}\n\n"
 
             # Send done event
             yield "data: [DONE]\n\n"
 
         except Exception as e:
             logger.error(f"Streaming error: {e}")
-            obs.log_error(trace_id=None, error=e, context={"endpoint": "/chat/stream"})
             error_data = json.dumps({"error": str(e)})
             yield f"data: {error_data}\n\n"
 
@@ -144,7 +108,7 @@ async def generate():
         headers={
             "Cache-Control": "no-cache",
             "Connection": "keep-alive",
-            "X-Accel-Buffering": "no",  # Disable nginx buffering
+            "X-Accel-Buffering": "no",
         }
     )