# docker-compose.yml - ClinicianFOCUS/local-llm-container
# Compose stack: Ollama (LLM service), a FastAPI wrapper, and a Caddy reverse proxy.
---
services:
  # Ollama service - the main LLM service.
  # No ports are published here; the container is reachable only over
  # ollama_network.
  ollama:
    container_name: ollama
    # Restart automatically unless the container was stopped manually.
    restart: unless-stopped
    # Named user and group; both must exist inside the built image.
    # NOTE(review): presumably maps to 1000:1000 - confirm in Dockerfile.ollama.
    user: "appuser:appgroup"
    build:
      # Build from the current directory using the Ollama-specific Dockerfile.
      context: .
      dockerfile: Dockerfile.ollama
      # Do not reuse cached layers when building.
      no_cache: true
    # Start the container through the bash entrypoint script.
    entrypoint:
      - "/usr/bin/bash"
      - "/ollama-entrypoint.sh"
    environment:
      # Negative keep-alive: loaded models stay in memory indefinitely.
      - "OLLAMA_KEEP_ALIVE=-1"
      # GPUs visible to the container; defaults to all.
      - "NVIDIA_VISIBLE_DEVICES=${NVIDIA_VISIBLE_DEVICES:-all}"
      # Number of concurrent requests; defaults to 1.
      # NOTE(review): not a standard Ollama variable - presumably read by
      # /ollama-entrypoint.sh; confirm.
      - "OLLAMA_CONCURRENT_REQUESTS=${OLLAMA_CONCURRENT_REQUESTS:-1}"
      # Request queuing toggle; defaults to true.
      # NOTE(review): same caveat as above - confirm where this is consumed.
      - "OLLAMA_QUEUE_ENABLED=${OLLAMA_QUEUE_ENABLED:-true}"
      # Default context length; falls back to 8192 when unset.
      - "OLLAMA_CONTEXT_LENGTH=${OLLAMA_CONTEXT_LENGTH:-8192}"
    networks:
      # Shared bridge network defined at the bottom of this file.
      - ollama_network
    deploy:
      resources:
        reservations:
          devices:
            # Reserve every available NVIDIA GPU for this container.
            - driver: nvidia
              count: all
              capabilities:
                - gpu

  # FastAPI wrapper service - the API interface for Ollama.
  # Host and port are hard-coded in the command rather than read from the
  # environment: this service publishes no ports and is reachable only over
  # the internal Docker network.
  fastapi-wrapper:
    container_name: authentication-ollama
    # Restart automatically unless the container was stopped manually.
    restart: unless-stopped
    # Named user and group; both must exist inside the built image.
    # NOTE(review): presumably maps to 1000:1000 - confirm in Dockerfile.wrapper.
    user: "appuser:appgroup"
    build:
      # Build from the current directory using the wrapper-specific Dockerfile.
      context: .
      dockerfile: Dockerfile.wrapper
      # Do not reuse cached layers when building.
      no_cache: true
    # Start the ollama container first. Short-form depends_on only orders
    # startup; it does not wait for Ollama to be ready to serve requests.
    depends_on:
      - ollama
    environment:
      # Unbuffered Python output.
      - "PYTHONUNBUFFERED=1"
      # Optional API key for authentication; empty when not provided.
      - "SESSION_API_KEY=${SESSION_API_KEY:-}"
    # Run the FastAPI app under uvicorn on all interfaces, port 5000, with
    # debug-level logging. The folded scalar yields a single-line command.
    command: >-
      uvicorn api_wrapper:app
      --host 0.0.0.0
      --port 5000
      --log-level debug
    networks:
      # Same network as the ollama service.
      - ollama_network

  # Caddy service - reverse proxy and HTTPS.
  # This is the only service in this file that publishes a port to the host.
  caddy:
    container_name: caddy-ollama # Container name for the Caddy service
    restart: unless-stopped # Restart automatically unless stopped manually
    # Named user and group; both must exist inside the built image.
    # NOTE(review): presumably maps to 1000:1000 - confirm in Dockerfile.caddy.
    user: "appuser:appgroup"
    build:
      context: . # Build context is the current directory
      dockerfile: Dockerfile.caddy # Use the Caddy-specific Dockerfile
      no_cache: true # Do not reuse cached layers when building
    environment:
      # Port passed into the container; defaults to 3334.
      - PUBLIC_ACCESS_PORT=${PUBLIC_ACCESS_PORT:-3334}
    ports:
      # Publish the same port number on the host and in the container,
      # configurable via PUBLIC_ACCESS_PORT (default 3334). The mapping is
      # quoted so the HOST:CONTAINER pair is always read as a string.
      - "${PUBLIC_ACCESS_PORT:-3334}:${PUBLIC_ACCESS_PORT:-3334}"
    networks:
      - ollama_network # Same network as the other services

# Network configuration
# A single user-defined network that every service in this file attaches to,
# so the containers can reach one another without publishing ports to the host.
networks:
  ollama_network:
    driver: bridge # Bridge driver for container-to-container communication