docker-compose.yaml

name: lumigator

services:

  minio:
    labels:
      ai.mozilla.product_name: lumigator
    image: quay.io/minio/minio:RELEASE.2024-12-18T13-15-44Z
    command: server /data --console-address ":9001"
    ports:
      - 9000:9000
      - 9001:9001
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
      interval: 5s
      timeout: 20s
      retries: 18
    environment:
      - MINIO_ROOT_USER
      - MINIO_ROOT_PASSWORD
      - MINIO_API_CORS_ALLOW_ORIGIN
    volumes:
    # - ${HOME}/minio/data:/data
      - minio-data:/data
    profiles:
      - local

  minio-admin:
    labels:
      ai.mozilla.product_name: lumigator
    image: quay.io/minio/minio:RELEASE.2024-12-18T13-15-44Z
    depends_on:
      minio:
        condition: service_healthy
    entrypoint:
      - /bin/bash
      - -c
      - |
          set -ex
          mc alias set lumigator_s3 http://minio:9000 ${MINIO_ROOT_USER} ${MINIO_ROOT_PASSWORD}
          mc admin user add lumigator_s3 lumigator lumigator
          mc admin policy attach lumigator_s3 readwrite --user lumigator
          mc mb -p lumigator_s3/lumigator-storage
    extra_hosts:
      - "localhost:host-gateway"
    profiles:
      - local

  redis:
    labels:
      ai.mozilla.product_name: lumigator
    image: redis:8.0-M03-alpine
    command: redis-server --save 60 1 --loglevel warning
    profiles:
      - local
    volumes:
      - redis-data:/data
    healthcheck:
      test: ["CMD-SHELL", "redis-cli ping"]
      interval: 1s
      timeout: 3s
      retries: 5

  ray:
    labels:
      ai.mozilla.product_name: lumigator
    image: rayproject/ray:2.30.0-py311${COMPUTE_TYPE}${RAY_ARCH_SUFFIX}
    depends_on:
      redis:
        condition: service_healthy
    ports:
      - "6379:6379"
      - "${RAY_DASHBOARD_PORT}:${RAY_DASHBOARD_PORT}"
      - "10001:10001"
    # https://docs.ray.io/en/releases-2.30.0/cluster/cli.html#ray-start for more info about the command
    # Apparently dead head nodes can be selected unless
    # RAY_JOB_ALLOW_DRIVER_ON_WORKER_NODES is set
    # Dead head nodes appear because of the GCS data being
    # persisted in Redis
    # https://github.com/ray-project/ray/issues/32167
    entrypoint:
      - /bin/bash
      - -c
      - |
          set -eaux
          # If the file was mounted in a volume instead of
          # a shared dir, ownership needs to be changed
          # ... || true allows this to fail (-e is set)
          sudo chown ray:users /home/ray/.cache/huggingface/hub || true
          sudo chown ray:users /tmp/ray_pip_cache || true
          RAY_JOB_ALLOW_DRIVER_ON_WORKER_NODES=1 RAY_REDIS_ADDRESS=redis:6379 ray start --head --dashboard-port=${RAY_DASHBOARD_PORT} --port=6379 --dashboard-host=0.0.0.0 --ray-client-server-port 10001
          mkdir -p /tmp/ray/session_latest/runtime_resources/pip
          rmdir /tmp/ray/session_latest/runtime_resources/pip/ && ln -s /tmp/ray_pip_cache /tmp/ray/session_latest/runtime_resources/pip
          sleep infinity
    shm_size: 2g
    volumes:
      - ${HF_HOME}:/home/ray/.cache/huggingface
      - ray-pip-cache:/tmp/ray_pip_cache
    deploy:
      resources:
        limits:
          cpus: '2'
          memory: '10g'
    environment:
      # LOCAL_FSSPEC_S3 env vars required by s3fs running inside evaluator ray jobs
      - LOCAL_FSSPEC_S3_ENDPOINT_URL=${AWS_ENDPOINT_URL} # Should match AWS_ENDPOINT_URL
      - LOCAL_FSSPEC_S3_KEY=${AWS_ACCESS_KEY_ID} # Should match AWS_SECRET_ACCESS_KEY
      - LOCAL_FSSPEC_S3_SECRET=${AWS_SECRET_ACCESS_KEY} # Should match AWS_SECRET_ACCESS_KEY
      - AWS_ACCESS_KEY_ID
      - AWS_SECRET_ACCESS_KEY
      - AWS_DEFAULT_REGION
      - AWS_ENDPOINT_URL

    # NOTE: to keep AWS_ENDPOINT_URL as http://localhost:9000 both on the host system
    #       and inside containers, we map localhost to the host gateway IP.
    #       This currently works properly, but might be the cause of networking
    #       issues down the line. This should be used only for local, development
    #       deployments.
    extra_hosts:
      - "localhost:host-gateway"
    profiles:
      - local
    healthcheck:
      test: ["CMD", "ray", "status"]
      interval: 3s
      timeout: 3s
      retries: 5

  backend:
    labels:
      ai.mozilla.product_name: lumigator
    image: mzdotai/lumigator:latest
    pull_policy: always
    build:
      context: .
      dockerfile: "Dockerfile"
      target: "main_image"
    depends_on:
      minio-admin:
        condition: service_completed_successfully
      minio:
        condition: "service_started"
        required: false
      ray:
        condition: "service_healthy"
        required: false
      mlflow:
        condition: "service_started"
        required: false
    ports:
      - 8000:8000
    environment:
      - LUMIGATOR_SECRET_KEY # Symmetric key used for encryption/decryption of stored secret data
      - DEPLOYMENT_TYPE
      # The local file needs to be available through a mount,
      # if persistence is needed
      - SQLALCHEMY_DATABASE_URL
      - S3_ENDPOINT_URL=${AWS_ENDPOINT_URL}
      - AWS_ACCESS_KEY_ID
      - AWS_SECRET_ACCESS_KEY
      - AWS_DEFAULT_REGION
      - AWS_ENDPOINT_URL
      - S3_BUCKET
      # TODO: the following two rows should be renamed to EVALUATOR_*
      #       and the two above should be removed when we depreate evaluator
      - EVALUATOR_PIP_REQS
      - EVALUATOR_WORK_DIR
      - INFERENCE_PIP_REQS
      - INFERENCE_WORK_DIR
      - RAY_DASHBOARD_PORT
      - RAY_HEAD_NODE_HOST
      - RAY_WORKER_GPUS
      - RAY_WORKER_GPUS_FRACTION
      - LUMIGATOR_API_CORS_ALLOWED_ORIGINS
      - MLFLOW_TRACKING_URI
    # NOTE: to keep AWS_ENDPOINT_URL as http://localhost:9000 both on the host system
    #       and inside containers, we map localhost to the host gateway IP.
    #       This currently works properly, but might be the cause of networking
    #       issues down the line. This should be used only for local, development
    #       deployments.
    extra_hosts:
      - "localhost:host-gateway"
    volumes:
      - database-data:/db-data


  frontend:
    labels:
      ai.mozilla.product_name: lumigator
    pull_policy: always
    image: mzdotai/lumigator-frontend:latest
    build:
      context: .
      dockerfile: "./lumigator/frontend/Dockerfile"
      target: "server"
      args:
       VUE_APP_BASE_URL: http://localhost/api/v1/
    environment:
      LUMIGATOR_API_PORT: 8000
      LUMIGATOR_API_HOST: backend
    volumes:
      - ./lumigator/frontend/nginx/:/etc/nginx/templates/
    depends_on:
      backend:
        condition: "service_started"
        required: true
    ports:
      - 80:80

  mlflow:
    labels:
      ai.mozilla.product_name: lumigator
    image: ghcr.io/mlflow/mlflow:v2.20.3
    environment:
      - AWS_ACCESS_KEY_ID
      - AWS_SECRET_ACCESS_KEY
      - AWS_DEFAULT_REGION
      - AWS_ENDPOINT_URL
      - MLFLOW_TRACKING_URI
      - MLFLOW_DATABASE_URL
      - MLFLOW_S3_ROOT_PATH
    ports:
      - "8001:5000"
    depends_on:
      minio:
        condition: service_healthy
    command:
      - sh
      - -c
      - |
        pip install --upgrade pip > /dev/null 2>&1
        pip install --no-cache-dir mlflow[extras] > /dev/null 2>&1
        mlflow server --backend-store-uri ${MLFLOW_DATABASE_URL} --default-artifact-root ${MLFLOW_S3_ROOT_PATH} --host 0.0.0.0
    extra_hosts:
      - "localhost:host-gateway"
    profiles:
      - local
    volumes:
      - mlflow-database-data:/db-data

volumes:
    minio-data:
      labels:
        ai.mozilla.product_name: lumigator
    database-data:
      labels:
        ai.mozilla.product_name: lumigator
    redis-data:
      labels:
        ai.mozilla.product_name: lumigator
    ray-pip-cache:
      labels:
        ai.mozilla.product_name: lumigator
    mlflow-database-data:
      labels:
        ai.mozilla.product_name: lumigator