# Offline_AI_Chatbot/docker-compose.yml (branch: bunOllama) · musemod/Offline_AI_Chatbot
services:
  # --- Frontend Service (Vite dev server) ---
  # Built from the `dev` target with this directory as build context, then
  # started with `bunx vite` listening on all container interfaces so the
  # published port 5173 is reachable from the host.
  frontend:
    build:
      context: .
      target: dev
    command:
      - bunx
      - vite
      - "--host"
      - "0.0.0.0"
    environment:
      BUN_ENV: development
      # Points at the backend's published host port (3000).
      VITE_API_URL: "http://localhost:3000"
    ports:
      - "5173:5173"
    networks:
      - app-network
    develop:
      watch:
        # Client sources, static assets, the HTML entry point and the Vite
        # config are copied into the running container whenever they change.
        - path: ./src/client
          target: /app/src/client
          action: sync
        - path: ./src/assets
          target: /app/src/assets
          action: sync
        - path: ./public
          target: /app/public
          action: sync
        - path: ./index.html
          target: /app/index.html
          action: sync
        - path: ./vite.config.ts
          target: /app/vite.config.ts
          action: sync
        # A change to package.json triggers an image rebuild.
        - path: package.json
          action: rebuild

  # --- AI Model Service (Ollama) ---
  # Serves the local LLMs on port 11434 with all NVIDIA GPUs reserved for it.
  ollama:
    # The official Docker ollama image includes all necessary CUDA libraries
    # for GPU support. The tag is pinned because pulling the latest version
    # can change how certain models are run.
    # A lightweight but unofficial CPU-only alternative is alpine/ollama
    # (https://hub.docker.com/r/alpine/ollama), which is only around 70 MB.
    image: ollama/ollama:0.16.3
    # Rename as needed.
    container_name: ollama-shared
    restart: unless-stopped
    # Handles zombie processes; recommended for containers.
    init: true
    ports:
      - "11434:11434"
    volumes:
      # Bind mount of a local shared models folder. Override the host side
      # with OLLAMA_MODELS_PATH; the fallback is a machine-specific path.
      - "${OLLAMA_MODELS_PATH:-/home/cynth/ollama_models_shared}:/root/.ollama/models"
      # Alternative: a named volume that persists downloaded models.
      # - ollama_models:/root/.ollama
    environment:
      # Listen on all interfaces.
      OLLAMA_HOST: "0.0.0.0"
      # NOTE(review): confirm Ollama reads OLLAMA_PORT; the listen port is
      # usually given as part of OLLAMA_HOST.
      OLLAMA_PORT: "11434"
      # Number of parallel requests the model can handle.
      OLLAMA_NUM_PARALLEL: "4"
      # Maximum number of models kept in memory simultaneously.
      OLLAMA_MAX_LOADED_MODELS: "2"
      # Keep models loaded forever (-1 = infinite).
      OLLAMA_KEEP_ALIVE: "-1"
      OLLAMA_DEBUG: "0"
      # Maximum context window size in tokens (affects VRAM).
      OLLAMA_CONTEXT_LENGTH: "8192"
      # Which GPUs to expose to the container.
      NVIDIA_VISIBLE_DEVICES: all
      # Required GPU capabilities (ensures the necessary CUDA drivers are
      # loaded inside the container).
      NVIDIA_DRIVER_CAPABILITIES: "compute,utility"
    # `deploy.resources.reservations.devices` is the more modern syntax for
    # what the legacy `runtime: nvidia` setting does.
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              # Reserve every GPU if the host has several.
              count: all
              capabilities:
                - gpu
    healthcheck:
      # Checks that Ollama is responding.
      test:
        - CMD
        - ollama
        - list
      interval: 30s
      timeout: 10s
      retries: 3
      # Gives Ollama time to start up.
      start_period: 20s
    networks:
      - app-network

  # --- Backend Service ---
  # Bun API server built from the `dev` target and published on port 3000.
  # It talks to Postgres via the `db` hostname and to Ollama via `ollama`.
  backend:
    build:
      context: .
      target: dev
    ports:
      - "3000:3000"
    volumes:
      - ./src/server/aiTest:/app/src/server/aiTest
    environment:
      # Database connection; these values mirror the credentials configured
      # on the `db` service.
      - DATABASE_URL=postgresql://root:root@db:5432/test_db
      - NODE_ENV=development
      - PORT=3000
      - DB_HOST=db
      - DB_PORT=5432
      - DB_NAME=test_db
      - DB_USER=root
      - DB_PASSWORD=root
      # Ollama OpenAI-compatible endpoint.
      - MODEL_URL=${MODEL_URL:-http://ollama:11434/v1/chat/completions}
      # Model for text-to-SQL tasks.
      - TEXT2SQL_MODEL=${TEXT2SQL_MODEL:-arctic-text2sql:latest}
      # Alternative text-to-SQL model option:
      # - TEXT2SQL_MODEL=${TEXT2SQL_MODEL:-distil-qwen3-4b:latest}
      # Main AI response model. The host variable now shares the container
      # variable's name, like the other model settings; the previous host
      # name AIRESP_MODEL is still honoured as a fallback.
      - AI_RESPONSE_MODEL=${AI_RESPONSE_MODEL:-${AIRESP_MODEL:-qwen2.5-coder:7b}}
      # Model for evaluating/validating responses.
      - JUDGE_MODEL=${JUDGE_MODEL:-qwen2.5-coder:7b}
      # Larger judge model option (needs more VRAM):
      # - JUDGE_MODEL=${JUDGE_MODEL:-qwen2.5-coder:14b}
      - MODEL_REGISTRY_URL=http://ollama:11434/api/tags
      - HARDWARE_TIER=4090-workstation

    command: ["bun", "run", "server"]
    develop:
      watch:
        # Server and shared sources are synced into the container and the
        # service is restarted; log and temp files are ignored.
        - action: sync+restart
          path: ./src/server
          target: /app/src/server
          ignore:
            - "**/*.log"
            - "**/*.tmp"
        - action: sync+restart
          path: ./src/shared
          target: /app/src/shared
        # A change to package.json triggers an image rebuild.
        - action: rebuild
          path: package.json
    depends_on:
      # `service_started` only waits for the db container to start, not for
      # Postgres to accept connections.
      # NOTE(review): confirm the server retries its first DB connection.
      db:
        condition: service_started
      # Wait until the Ollama healthcheck passes.
      ollama:
        condition: service_healthy
    networks:
      - app-network

  # --- PostgreSQL Database ---
  # Postgres 16 with data persisted in the `postgres_offline_data` volume and
  # port 5432 published to the host.
  db:
    image: postgres:16-alpine
    restart: always
    shm_size: "128mb"
    environment:
      # NOTE: these hard-coded credentials are not secure and should be
      # replaced; the backend service uses the same values.
      POSTGRES_USER: root
      POSTGRES_PASSWORD: root
      POSTGRES_DB: test_db
    ports:
      - "5432:5432"
    volumes:
      - postgres_offline_data:/var/lib/postgresql/data
    healthcheck:
      # Reports healthy once Postgres accepts connections. `$$` defers
      # expansion to the container shell, so the check follows the
      # POSTGRES_USER / POSTGRES_DB values set above.
      test: ["CMD-SHELL", "pg_isready -U $$POSTGRES_USER -d $$POSTGRES_DB"]
      interval: 10s
      timeout: 5s
      retries: 5
      start_period: 10s
    networks:
      - app-network

  # --- pgAdmin GUI ---
  # Web UI for the database, published on host port 5050, with its state
  # kept in the `pgadmin_offline_data` volume.
  pgadmin:
    # NOTE(review): the image is untagged; consider pinning a version as the
    # other services do.
    image: dpage/pgadmin4
    container_name: pgadmin4_container
    restart: always
    depends_on:
      db:
        condition: service_started
    environment:
      - PGADMIN_DEFAULT_EMAIL=admin@admin.com
      - PGADMIN_DEFAULT_PASSWORD=root
    ports:
      - "5050:80"
    volumes:
      - pgadmin_offline_data:/var/lib/pgadmin
    networks:
      - app-network

# --- Networks ---
# Single bridge network joined by every service in this file; services
# address each other by service name on it (e.g. `db`, `ollama`).
networks:
  app-network:
    driver: "bridge"

# --- Volumes ---
# Named volumes with default settings: Postgres data and pgAdmin state.
volumes:
  postgres_offline_data: {}
  pgadmin_offline_data: {}