Commit 76475fa: polish-and-docker
codefromthecrypt committed Feb 26, 2025 (parent: 4182f10)
Signed-off-by: Adrian Cole <[email protected]>
Showing 18 changed files with 359 additions and 512 deletions.
.dockerignore (new)
@@ -0,0 +1,8 @@
# Ignore everything
**

# Allow specific files and directories
!requirements.txt
!data/
!src/
!stages/
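
The pattern here is deny-by-default: `**` excludes everything from the build context, and only the four paths the image actually copies are re-allowed. A quick way to sanity-check what gets sent (a sketch, assuming BuildKit, whose plain progress output reports the context transfer):

```bash
# Print only the build-context transfer lines from a plain-progress build
docker build --progress=plain . 2>&1 | grep -i "transferring context"
```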
env.example
@@ -1,13 +1,17 @@
-# Elasticsearch Configuration
-ELASTIC_API_KEY=your_api_key_here
-ELASTICSEARCH_ENDPOINT=your_elastic_endpoint
+# Make a copy of this file with the name .env and assign values to variables
+
+# How you connect to Elasticsearch: change details to your instance
+ELASTICSEARCH_URL=
+ELASTICSEARCH_API_KEY=
+# If not using API key, uncomment these and fill them in:
+# ELASTICSEARCH_USER=elastic
+# ELASTICSEARCH_PASSWORD=elastic
 
 # OpenAI Configuration
-OPENAI_API_KEY=your_openai_api_key_here
+OPENAI_API_KEY=
 
 # Model Configuration
 MODEL_PATH=~/.cache/torch/checkpoints/imagebind_huge.pth
 
 # Optional Configuration
-#LOG_LEVEL=INFO
-#DEBUG=False
+# LOG_LEVEL=INFO
+# DEBUG=False
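
The stage scripts read these variables from the environment. When running a stage outside Docker, one option is the dotenv CLI (a sketch, assuming `python-dotenv[cli]` from the earlier setup instructions is installed):

```bash
# Run one stage with .env loaded into its environment (hypothetical local run)
dotenv run -- python stages/01-stage/files_check.py
```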
Dockerfile
@@ -1,20 +1,36 @@
-FROM ubuntu:24.04
+# Use non-slim image due to OS dependencies of python packages. This gives us
+# git, build-essential, libglib2 (opencv) and gomp (torchaudio).
+FROM python:3.12
 
-# Install necessary packages
-RUN apt update && apt install -y --no-install-recommends \
-    python3 \
-    python3-pip \
-    python3-venv \
-    g++ \
-    gcc \
-    python3.12-dev
+COPY /requirements.txt .
 
-# Create and activate a virtual environment
-RUN python3 -m venv /opt/venv
-ENV PATH="/opt/venv/bin:$PATH"
+# Our python requirements have some OS dependencies beyond the base layer:
+#
+# * imagebind pulls in cartopy which has OS dependencies on geos and proj
+# * opencv has a runtime OS dependency on libgl1-mesa-glx
+#
+# The dev dependencies are installed temporarily to compile the wheels.
+# We leave only the runtime dependencies, to keep the image smaller.
+RUN apt-get update && \
+    # install build and runtime dependencies
+    apt-get install -y --no-install-recommends \
+      libgeos-dev \
+      libproj-dev \
+      libgeos-c1v5 \
+      libproj25 \
+      libgl1-mesa-glx && \
+    # Install everything except xformers first
+    grep -v "\bxformers\b" requirements.txt > /tmp/r.txt && pip install -r /tmp/r.txt && \
+    # Now, install xformers, as it should be able to see torch now
+    grep "\bxformers\b" requirements.txt > /tmp/r.txt && pip install -r /tmp/r.txt && \
+    # remove build dependencies
+    apt-get purge -y libgeos-dev libproj-dev && \
+    apt-get autoremove -y && \
+    rm -rf /var/lib/apt/lists/*
 
 WORKDIR /app
+RUN mkdir -p ./data ./src ./stages
+COPY ./data ./data
+COPY ./src ./src
+COPY ./stages ./stages
-
-# Install Python packages in the virtual environment
-RUN pip install --upgrade pip
-RUN pip install torch
-RUN pip install wheel setuptools
-RUN pip install transformers xformers
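
The two-phase pip install works around xformers needing torch importable at build time: pip resolves a requirements file in one pass, so installing both together can fail before torch is present. The same trick outside Docker, as a sketch (assuming requirements.txt pins xformers):

```bash
# Phase 1: everything except xformers, which installs torch
grep -v "\bxformers\b" requirements.txt > /tmp/r.txt
pip install -r /tmp/r.txt
# Phase 2: xformers alone, now built against the torch installed above
grep "\bxformers\b" requirements.txt > /tmp/r.txt
pip install -r /tmp/r.txt
```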
README.md
@@ -11,67 +11,34 @@ The pipeline demonstrates how to:

 ## Prerequisites
 
-- Python 3.10+
+- A Docker runtime with 8GB+ free RAM
+- GPU is optional, but recommended
 - Elasticsearch cluster (cloud or local)
 - OpenAI API key - Setup an OpenAI account and create a [secret key](https://platform.openai.com/docs/quickstart)
-- 8GB+ RAM
-- GPU (optional but recommended)
 
 ## Quick Start
 
-1. **Setup Environment**
-```bash
-rm -rf .venv requirements.txt
-python3 -m venv .venv
-source .venv/bin/activate
-pip install pip-tools
-# Recreate requirements.txt
-pip-compile
-# Install main dependencies
-pip install -r requirements.txt
-
-python3 -m venv .venv
-source .venv/bin/activate
-pip install "python-dotenv[cli]"
-pip install -r requirements-torch.txt
-pip install -r requirements.txt
-
-# Make sure you have pytorch installed and Python 3.10+
-pip install torch torchvision torchaudio
-
-# Create and activate virtual environment
-python -m venv env_mmrag
-source env_mmrag/bin/activate # Unix/MacOS
-# or
-.\env_mmrag\Scripts\activate # Windows
-
-# Install dependencies
-pip install -r requirements.txt
-```
+This example runs four stages as docker compose services:
 
-2. **Configure Credentials**
-Create a `.env` file:
-```env
-ELASTICSEARCH_ENDPOINT="your-elasticsearch-endpoint"
-ELASTIC_API_KEY="your-elastic-api-key"
-OPENAI_API_KEY="your-openai-api-key"
-```
+```mermaid
+graph TD
+    verify-file-structure --> generate-embeddings
+    generate-embeddings --> index-content
+    index-content --> search-and-analyze
+```
 
-3. **Run the Demo**
-```bash
-# Verify file structure
-python stages/01-stage/files_check.py
+First, copy [env.example](env.example) to `.env` and fill in the values noted inside.
 
-# Generate embeddings
-python stages/02-stage/test_embedding_generation.py
+Now, run the pipeline:
+```bash
+docker compose run --build --rm search-and-analyze
+```
 
-# Index content
-python stages/03-stage/index_all_modalities.py
+The first time takes a while to build the image and download ImageBind weights.
 
-# Search and analyze
-python stages/04-stage/rag_crime_analyze.py
+If you want to re-run just one stage, add `--no-deps` like this:
+```bash
+docker compose run --no-deps --build --rm search-and-analyze
+```
 
 ## Project Structure
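
Put together, the new quick start is only two commands (a sketch; the service name comes from the compose file below):

```bash
cp env.example .env  # then fill in ELASTICSEARCH_URL, ELASTICSEARCH_API_KEY and OPENAI_API_KEY
docker compose run --build --rm search-and-analyze  # depends_on pulls in the three earlier stages
```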
Docker Compose file (new)
@@ -0,0 +1,61 @@
name: gotham-city-crime-analysis

services:
  verify-file-structure:
    build:
      context: .
    container_name: verify-file-structure
    restart: 'no'  # no need to re-verify file structure
    env_file:
      - .env
    command: python stages/01-stage/files_check.py
    extra_hosts:  # send localhost traffic to the docker host, e.g. your laptop
      - "localhost:host-gateway"

  generate-embeddings:
    depends_on:
      verify-file-structure:
        condition: service_completed_successfully
    build:
      context: .
    container_name: generate-embeddings
    restart: 'no'  # no need to re-generate embeddings
    env_file:
      - .env
    command: python stages/02-stage/test_embedding_generation.py
    extra_hosts:  # send localhost traffic to the docker host, e.g. your laptop
      - "localhost:host-gateway"
    volumes:
      - torch-checkpoints:/root/.cache/torch/checkpoints/

  index-content:
    depends_on:
      generate-embeddings:
        condition: service_completed_successfully
    build:
      context: .
    container_name: index-content
    restart: 'no'  # no need to re-index content
    env_file:
      - .env
    command: python stages/03-stage/index_all_modalities.py
    extra_hosts:  # send localhost traffic to the docker host, e.g. your laptop
      - "localhost:host-gateway"

  search-and-analyze:
    depends_on:
      index-content:
        condition: service_completed_successfully
    build:
      context: .
    container_name: search-and-analyze
    restart: 'no'  # no need to re-run analysis
    env_file:
      - .env
    command: python stages/04-stage/rag_crime_analyze.py
    extra_hosts:  # send localhost traffic to the docker host, e.g. your laptop
      - "localhost:host-gateway"

volumes:
  # Avoid re-downloading a >4GB model checkpoint
  torch-checkpoints:
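
The named volume is what keeps repeat runs fast: the ImageBind checkpoint survives container removal. To inspect or reset it (a sketch; Compose prefixes volume names with the project name, so the full name should be `gotham-city-crime-analysis_torch-checkpoints`):

```bash
docker volume ls | grep torch-checkpoints                      # confirm the checkpoint cache exists
docker volume rm gotham-city-crime-analysis_torch-checkpoints  # force a fresh download on the next run
```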
