diff --git a/supporting-blog-content/building-multimodal-rag-with-elasticsearch-gotham/.dockerignore b/supporting-blog-content/building-multimodal-rag-with-elasticsearch-gotham/.dockerignore
deleted file mode 100644
index c93b7331..00000000
--- a/supporting-blog-content/building-multimodal-rag-with-elasticsearch-gotham/.dockerignore
+++ /dev/null
@@ -1,8 +0,0 @@
-# Ignore everything
-**
-
-# Allow specific files and directories
-!requirements.txt
-!data/
-!src/
-!stages/
diff --git a/supporting-blog-content/building-multimodal-rag-with-elasticsearch-gotham/Dockerfile b/supporting-blog-content/building-multimodal-rag-with-elasticsearch-gotham/Dockerfile
deleted file mode 100644
index 2b22ab0d..00000000
--- a/supporting-blog-content/building-multimodal-rag-with-elasticsearch-gotham/Dockerfile
+++ /dev/null
@@ -1,36 +0,0 @@
-# Use the non-slim image due to OS dependencies of the Python packages. This gives
-# us git, build-essential, libglib2 (for opencv) and gomp (for torchaudio).
-FROM python:3.12
-
-COPY /requirements.txt .
-
-# Our Python requirements have some OS dependencies beyond the base layer:
-#
-# * imagebind pulls in cartopy, which has OS dependencies on geos and proj
-# * opencv has a runtime OS dependency on libgl1-mesa-glx
-#
-# The dev dependencies are installed temporarily to compile the wheels.
-# We leave only the runtime dependencies, to keep the image smaller.
-RUN apt-get update && \
-    # Install build and runtime dependencies
-    apt-get install -y --no-install-recommends \
-    libgeos-dev \
-    libproj-dev \
-    libgeos-c1v5 \
-    libproj25 \
-    libgl1-mesa-glx && \
-    # Install everything except xformers first
-    grep -v "\bxformers\b" requirements.txt > /tmp/r.txt && pip install -r /tmp/r.txt && \
-    # Now install xformers, which can see the already-installed torch
-    grep "\bxformers\b" requirements.txt > /tmp/r.txt && pip install -r /tmp/r.txt && \
-    # Remove the build dependencies
-    apt-get purge -y libgeos-dev libproj-dev && \
-    apt-get autoremove -y && \
-    rm -rf /var/lib/apt/lists/*
-
-WORKDIR /app
-RUN mkdir -p ./data ./src ./stages
-COPY ./data ./data
-COPY ./src ./src
-COPY ./stages ./stages
-
diff --git a/supporting-blog-content/building-multimodal-rag-with-elasticsearch-gotham/docker-compose.yml b/supporting-blog-content/building-multimodal-rag-with-elasticsearch-gotham/docker-compose.yml
deleted file mode 100644
index fd08c664..00000000
--- a/supporting-blog-content/building-multimodal-rag-with-elasticsearch-gotham/docker-compose.yml
+++ /dev/null
@@ -1,61 +0,0 @@
-name: gotham-city-crime-analysis
-
-services:
-  verify-file-structure:
-    build:
-      context: .
-    container_name: verify-file-structure
-    restart: 'no'  # no need to re-verify file structure
-    env_file:
-      - .env
-    command: python stages/01-stage/files_check.py
-    extra_hosts:  # send localhost traffic to the docker host, e.g. your laptop
-      - "localhost:host-gateway"
-
-  generate-embeddings:
-    depends_on:
-      verify-file-structure:
-        condition: service_completed_successfully
-    build:
-      context: .
-    container_name: generate-embeddings
-    restart: 'no'  # no need to re-generate embeddings
-    env_file:
-      - .env
-    command: python stages/02-stage/test_embedding_generation.py
-    extra_hosts:  # send localhost traffic to the docker host, e.g. your laptop
-      - "localhost:host-gateway"
-    volumes:
-      - torch-checkpoints:/root/.cache/torch/checkpoints/
-
-  index-content:
-    depends_on:
-      generate-embeddings:
-        condition: service_completed_successfully
-    build:
-      context: .
-    container_name: index-content
-    restart: 'no'  # no need to re-index content
-    env_file:
-      - .env
-    command: python stages/03-stage/index_all_modalities.py
-    extra_hosts:  # send localhost traffic to the docker host, e.g. your laptop
-      - "localhost:host-gateway"
-
-  search-and-analyze:
-    depends_on:
-      index-content:
-        condition: service_completed_successfully
-    build:
-      context: .
-    container_name: search-and-analyze
-    restart: 'no'  # no need to re-run the analysis
-    env_file:
-      - .env
-    command: python stages/04-stage/rag_crime_analyze.py
-    extra_hosts:  # send localhost traffic to the docker host, e.g. your laptop
-      - "localhost:host-gateway"
-
-volumes:
-  # Avoid re-downloading a >4GB model checkpoint
-  torch-checkpoints:
diff --git a/supporting-blog-content/building-multimodal-rag-with-elasticsearch-gotham/docker-setup.md b/supporting-blog-content/building-multimodal-rag-with-elasticsearch-gotham/docker-setup.md
deleted file mode 100644
index f049b93d..00000000
--- a/supporting-blog-content/building-multimodal-rag-with-elasticsearch-gotham/docker-setup.md
+++ /dev/null
@@ -1,97 +0,0 @@
-# Building a Multimodal RAG Pipeline with Elasticsearch: The Story of Gotham City
-
-This repository contains the code for implementing a Multimodal Retrieval-Augmented Generation (RAG) system using Elasticsearch. The system processes and analyzes different types of evidence (images, audio, text, and depth maps) to solve a crime in Gotham City.
-
-## Overview
-
-The pipeline demonstrates how to:
-- Generate unified embeddings for multiple modalities using ImageBind
-- Store and search vectors efficiently in Elasticsearch
-- Analyze evidence using GPT-4 to generate forensic reports
-
-## Prerequisites
-
-- A Docker runtime with 8GB+ of free RAM
-  - A GPU is optional, but recommended
-- An Elasticsearch cluster (cloud or local)
-- An OpenAI API key - set up an OpenAI account and create a [secret key](https://platform.openai.com/docs/quickstart)
-
-## Quick Start
-
-This example runs four stages as Docker Compose services:
-
-```mermaid
-graph TD
-    verify-file-structure --> generate-embeddings
-    generate-embeddings --> index-content
-    index-content --> search-and-analyze
-```
-
-### 1. Configure Environment Variables
-
-First, copy [env.example](env.example) to `.env`, then edit it and fill in your credentials:
-
-```env
-# Elasticsearch Configuration
-ELASTICSEARCH_URL="https://your-elasticsearch-endpoint:443"
-ELASTICSEARCH_API_KEY="your-api-key"
-# If not using an API key, uncomment these and fill them in:
-# ELASTICSEARCH_USER=elastic
-# ELASTICSEARCH_PASSWORD=elastic
-
-# OpenAI Configuration
-OPENAI_API_KEY="your-openai-api-key"
-```
-
-### 2. Configure Docker Resources
-
-The ImageBind model requires significant memory. Ensure Docker has enough resources:
-
-- **Memory**: At least 8GB (16GB recommended)
-- **Storage**: At least 10GB of free space
-
-For Docker Desktop users:
-1. Open Docker Desktop settings
-2. Go to Resources > Advanced
-3. Increase the memory allocation to at least 8GB
-4. Apply & Restart
-
-### 3. Running the Complete Pipeline
-
-To run the entire pipeline, from file structure verification to evidence analysis:
-
-```bash
-docker compose run --build --rm search-and-analyze
-```
-
-This command will:
-1. Build the Docker image if needed
-2. Run each stage in sequence
-3. Cache the ImageBind model weights for future runs
-
-The first run will take longer, as it builds the image and downloads the model weights (~4.5GB).
-
-### 4. Running Individual Stages
-
-If you prefer to run each stage separately:
-
-```bash
-# File structure verification
-docker compose run --build --rm verify-file-structure
-
-# Generate embeddings
-docker compose run --build --rm generate-embeddings
-
-# Index content in Elasticsearch
-docker compose run --build --rm index-content
-
-# Search and analyze evidence
-docker compose run --build --rm search-and-analyze
-```
-
-To skip dependency checks when running a specific stage:
-
-```bash
-docker compose run --no-deps --build --rm search-and-analyze
-```
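
Note for anyone re-creating the setup these removed files described: the pipeline assumes the Elasticsearch credentials in `.env` are valid before the first stage runs, and a failed connection only surfaces after the image build. A minimal pre-flight sketch, assuming the `ELASTICSEARCH_URL` and `ELASTICSEARCH_API_KEY` values from `env.example` and API-key authentication:

```bash
# Load the credentials from .env (assumes the file follows env.example)
set -a; source .env; set +a

# Ping the cluster root; a JSON body containing "cluster_name" confirms
# the endpoint and API key work before any containers are built.
curl -s -H "Authorization: ApiKey ${ELASTICSEARCH_API_KEY}" "${ELASTICSEARCH_URL}"
```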