# Source: ubyssey/ubyssey.ca - deployment/compose/docker-compose.staging.yml
# (commit aedfea6a238b032e2433caa926ccbc5f2edcac24)

# Compose file format version used by this stack definition.
version: '3.7'

networks:
  # Private network for the services defined in this file. It is an
  # attachable overlay network, so standalone containers can join it too.
  ubyssey-internal:
    attachable: true
    driver: overlay

  # Declared external: it is created outside this file and must already
  # exist when the stack is deployed.
  proxy-shared:
    external: true

services:
  # MySQL database for the stack. It is attached to the internal network
  # only and publishes no ports, so other services reach it by the service
  # name `mysql`.
  mysql:
    image: mysql:8.0
    environment:
      MYSQL_DATABASE: ubyssey
      MYSQL_USER: ubyssey
      # Both the `ubyssey` user and the root account read their password
      # from the same Docker secret.
      MYSQL_PASSWORD_FILE: /run/secrets/SQL_PASSWORD
      MYSQL_ROOT_PASSWORD_FILE: /run/secrets/SQL_PASSWORD
    secrets:
      - SQL_PASSWORD
    networks:
      - ubyssey-internal
    volumes:
      # Bind mount: the database files live in ./mysql-data on the host.
      # NOTE(review): with more than one swarm node this directory is local
      # to whichever node runs the task - confirm placement is fixed.
      - ./mysql-data:/var/lib/mysql

  # Memcached instance. Attached to the internal network and additionally
  # published on host port 11211.
  # NOTE(review): memcached has no authentication by default - confirm the
  # host port really needs to be published.
  cache:
    image: memcached:1.6.10
    ports:
      - "11211:11211"
    networks:
      - ubyssey-internal
    # Healthy when a TCP connection to port 11211 inside the container can
    # be opened within 5 seconds.
    healthcheck:
      test:
        - "CMD"
        - "bash"
        - "-c"
        - "timeout 5 bash -c '</dev/tcp/localhost/11211'"
      start_period: 10s
      interval: 30s
      timeout: 10s
      retries: 5

  # Redis instance. Attached to the internal network and additionally
  # published on host port 6379.
  redis:
    # NOTE(review): `restart` is ignored when a file is deployed with
    # `docker stack deploy`; a `deploy.restart_policy` would be needed there.
    restart: always
    # NOTE(review): `latest` is a moving tag - consider pinning a version so
    # redeploys are reproducible.
    image: redis:latest
    networks:
      - ubyssey-internal
    volumes:
      # The official redis image writes its data files to /data, so the named
      # volume has to be mounted there for the data to outlive the container.
      - redis-data:/data
    ports:
      - "6379:6379"

  # django-wsgi: gunicorn (WSGI) service for regular HTTP traffic, port 8000.
  django-wsgi:
    image: ghcr.io/ubyssey/ubyssey.ca:${TAG}

    depends_on:
      - mysql
      - cache
      - redis

    networks:
      - proxy-shared
      - ubyssey-internal

    expose:
      - 8000

    # Docker secrets granted to this service. The *_FILE variables in
    # `environment` point at their paths under /run/secrets/.
    secrets:
      - DJANGO_SECRET_KEY
      - SQL_PASSWORD
      - GS_ACCESS_KEY_ID
      - GS_SECRET_ACCESS_KEY
      - GOOGLE_APPLICATION_CREDENTIALS
      - EMAIL_HOST_PASSWORD

    environment:
      # Django settings
      SECRET_KEY_FILE: /run/secrets/DJANGO_SECRET_KEY
      EMAIL_HOST_PASSWORD_FILE: /run/secrets/EMAIL_HOST_PASSWORD
      STATIC_URL: https://storage.googleapis.com/ubyssey-staging/static/
      # Database connection (the `mysql` service of this stack)
      SQL_HOST: mysql
      SQL_DATABASE: ubyssey
      SQL_USER: root
      SQL_PASSWORD_FILE: /run/secrets/SQL_PASSWORD
      # Google Cloud credentials
      GS_ACCESS_KEY_ID_FILE: /run/secrets/GS_ACCESS_KEY_ID
      GS_SECRET_ACCESS_KEY_FILE: /run/secrets/GS_SECRET_ACCESS_KEY
      GOOGLE_APPLICATION_CREDENTIALS: /run/secrets/GOOGLE_APPLICATION_CREDENTIALS

    # Startup sequence: run `manage.py migrate`, run `manage.py crontab add`,
    # start the cron service, then launch gunicorn bound to 0.0.0.0:8000 with
    # access and error logs written to stdout/stderr.
    #
    # Worker sizing: the VM has 8 CPU cores, giving a total gunicorn budget of
    # (2 x $num_cores) + 1 = (2 x 8) + 1 = 17 workers. Divided over the 4
    # replicas configured under `deploy`, that is floor(17 / 4) = 4 workers
    # per replica.
    #
    # NOTE(review): every replica runs this same sequence, so the migration
    # and cron steps execute once per replica - confirm this is intended.
    #
    # Ref: https://medium.com/@jleonro/finetunne-number-of-workers-in-gunicorn-ab1907b06cae
    command: >
      bash -c "python manage.py migrate
      && python manage.py crontab add
      && service cron start
      && gunicorn ubyssey.wsgi:application --workers=4 --bind 0.0.0.0:8000
      --access-logfile - --error-logfile -"

    # Ship container logs to Google Cloud Logging.
    # Ref: https://docs.docker.com/engine/logging/drivers/gcplogs/
    logging:
      driver: gcplogs

    deploy:
      # Four replicas of the Django app, so a momentary crash of a single
      # container does not take the whole service down.
      # Ref: https://docs.docker.com/reference/compose-file/deploy/#replicas
      replicas: 4
      mode: replicated

      # Updates roll out two containers at a time, leaving the other replicas
      # serving during a release. Staging deliberately has no rollback: a
      # failed update is paused so the breakage can be inspected.
      # Ref: https://docs.docker.com/reference/compose-file/deploy/#update_config
      update_config:
        parallelism: 2
        order: stop-first
        delay: 10s
        monitor: 30s
        failure_action: pause
        max_failure_ratio: 0

      # On failure, restart a container up to 10 times.
      # Ref: https://docs.docker.com/reference/compose-file/deploy/#restart_policy
      restart_policy:
        condition: on-failure
        window: 120s
        max_attempts: 10

  # django-asgi: gunicorn + uvicorn (ASGI) service for WebSocket traffic at
  # /ws/, listening on port 8001.
  django-asgi:
    image: ghcr.io/ubyssey/ubyssey.ca:${TAG}

    depends_on:
      - mysql
      - cache
      - redis

    networks:
      - proxy-shared
      - ubyssey-internal

    expose:
      - 8001

    # Docker secrets granted to this service. The *_FILE variables in
    # `environment` point at their paths under /run/secrets/.
    secrets:
      - DJANGO_SECRET_KEY
      - SQL_PASSWORD
      - GS_ACCESS_KEY_ID
      - GS_SECRET_ACCESS_KEY
      - GOOGLE_APPLICATION_CREDENTIALS
      - EMAIL_HOST_PASSWORD

    environment:
      # Django settings
      SECRET_KEY_FILE: /run/secrets/DJANGO_SECRET_KEY
      EMAIL_HOST_PASSWORD_FILE: /run/secrets/EMAIL_HOST_PASSWORD
      STATIC_URL: https://storage.googleapis.com/ubyssey-staging/static/
      # Database connection (the `mysql` service of this stack)
      SQL_HOST: mysql
      SQL_DATABASE: ubyssey
      SQL_USER: root
      SQL_PASSWORD_FILE: /run/secrets/SQL_PASSWORD
      # Google Cloud credentials
      GS_ACCESS_KEY_ID_FILE: /run/secrets/GS_ACCESS_KEY_ID
      GS_SECRET_ACCESS_KEY_FILE: /run/secrets/GS_SECRET_ACCESS_KEY
      GOOGLE_APPLICATION_CREDENTIALS: /run/secrets/GOOGLE_APPLICATION_CREDENTIALS

    # Gunicorn runs with the Uvicorn worker class so the app is served over
    # ASGI, which is what WebSocket support requires.
    #
    # Sizing: 2 workers per replica x 2 replicas = 4 ASGI workers in total.
    # - The overall worker budget, (2 x 8 cores) + 1 = 17, is shared between
    #   the WSGI and ASGI services.
    # - Most traffic is plain HTTP and goes to WSGI; WebSocket traffic is
    #   minimal, so ASGI gets the smaller share.
    # - Each ASGI worker is async and multiplexes many concurrent WebSocket
    #   connections.
    #
    # Two replicas give fault tolerance (50% capacity remains if one fails)
    # and allow updating one replica at a time.
    #
    # NOTE(review): django-wsgi runs 4 replicas x 4 workers = 16; adding these
    # 4 gives 20, above the stated budget of 17 - confirm the intended split.
    #
    # Ref: https://medium.com/@jleonro/finetunne-number-of-workers-in-gunicorn-ab1907b06cae
    command: >
      bash -c "gunicorn ubyssey.asgi:application --workers=2
      -k uvicorn.workers.UvicornWorker --bind 0.0.0.0:8001
      --access-logfile - --error-logfile -"

    # Ship container logs to Google Cloud Logging.
    # Ref: https://docs.docker.com/engine/logging/drivers/gcplogs/
    logging:
      driver: gcplogs

    deploy:
      # Two replicas handle the WebSocket traffic.
      replicas: 2
      mode: replicated

      # Updates roll out one container at a time so the other replica keeps
      # its WebSocket connections. Staging deliberately has no rollback: a
      # failed update is paused so the breakage can be inspected.
      # Ref: https://docs.docker.com/reference/compose-file/deploy/#update_config
      update_config:
        parallelism: 1
        order: stop-first
        delay: 10s
        monitor: 30s
        failure_action: pause
        max_failure_ratio: 0

      # On failure, restart a container up to 10 times.
      # Ref: https://docs.docker.com/reference/compose-file/deploy/#restart_policy
      restart_policy:
        condition: on-failure
        window: 120s
        max_attempts: 10

  # nginx: the only service publishing ports 80 and 443 on the host.
  # NOTE(review): presumably it proxies to django-wsgi / django-asgi; the
  # routing lives in ./nginx/, which is not visible from this file.
  nginx:
    image: nginx:1.27
    # NOTE(review): `restart` is ignored when a file is deployed with
    # `docker stack deploy`; the `deploy.restart_policy` below applies there.
    restart: always
    volumes:
      # nginx configuration, read-only
      - ./nginx/:/etc/nginx/:ro
      # Paths written by the certbot service and read here
      - ./certbot/www/:/var/www/certbot/:ro
      - ./certbot/letsencrypt-etc/:/etc/letsencrypt/:ro
      # Volume shared with the certbot service
      - certbot-signals:/tmp/certbot-signals
      - ./scripts/nginx-reload-watcher.sh:/nginx-reload-watcher.sh:ro
    networks:
      - proxy-shared
      - ubyssey-internal
    # Port mappings are quoted so a YAML parser can never read HOST:CONTAINER
    # as a number, and to match the other services in this file.
    ports:
      - "80:80"
      - "443:443"
    logging:
      driver: gcplogs
    # Run the reload watcher in the background and nginx in the foreground
    command: >
      sh -c "sh /nginx-reload-watcher.sh & nginx -g 'daemon off;'"

    # `nginx -t` only validates the configuration files; it does not probe the
    # running server. The container is therefore marked unhealthy when the
    # mounted config is broken, which prevents bad configs from being
    # deployed silently.
    healthcheck:
      test: ["CMD", "nginx", "-t"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 10s

    deploy:
      # Restart containers on failure up to a maximum of 5 times
      restart_policy:
        condition: on-failure
        max_attempts: 5
        window: 120s

      # Rolling update strategy - only update if health check passes
      # NOTE: No rollback for staging - we want to see what breaks
      update_config:
        parallelism: 1
        delay: 10s
        order: stop-first
        monitor: 30s
        max_failure_ratio: 0
        failure_action: pause

    depends_on:
      - django-wsgi
      - django-asgi

  # certbot: runs the renewal script mounted from ./scripts/ instead of the
  # image's own entrypoint.
  certbot:
    image: certbot/certbot:v4.0.0

    # The script signals nginx to reload through the shared certbot-signals
    # volume.
    entrypoint:
      - "/bin/sh"
      - "/certbot-renewal-with-nginx-reload.sh"

    networks:
      - ubyssey-internal

    volumes:
      - ./scripts/certbot-renewal-with-nginx-reload.sh:/certbot-renewal-with-nginx-reload.sh:ro
      - certbot-signals:/tmp/certbot-signals
      # Writable certbot paths; nginx mounts the first two read-only
      - ./certbot/www/:/var/www/certbot/:rw
      - ./certbot/letsencrypt-etc/:/etc/letsencrypt/:rw
      - ./certbot/letsencrypt-lib/:/var/lib/letsencrypt/:rw

    deploy:
      # This is a long-running background service: if it fails, restart it
      # after 5 seconds, up to 5 times.
      restart_policy:
        condition: on-failure
        max_attempts: 5
        delay: 5s

# Named volumes used by the services in this file.
volumes:
  # Mounted by the redis service; default volume settings.
  redis-data: {}

  # Mounted by both certbot and nginx at /tmp/certbot-signals.
  certbot-signals:
    driver: local

# Docker secrets referenced by the services in this file.
secrets:
  # Created from a local file at deploy time.
  GOOGLE_APPLICATION_CREDENTIALS:
    file: ./service-account.json

  # The remaining secrets are external: they must already exist before the
  # stack is deployed.
  DJANGO_SECRET_KEY:
    external: true
  EMAIL_HOST_PASSWORD:
    external: true
  SQL_PASSWORD:
    external: true
  GS_ACCESS_KEY_ID:
    external: true
  GS_SECRET_ACCESS_KEY:
    external: true