fork/deploy/docker-compose.yml at main · shaharmor98/fork · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# IMPORTANT NOTE: Make sure this is in sync with lib/runtime/docker-compose.yml
# Two bridge networks: `server` is attached to nats-server and etcd-server,
# `monitoring` is attached to every service in this file.
networks:
  server: {driver: bridge}
  monitoring: {driver: bridge}

# Note that the images are pinned to specific versions to avoid breaking changes.
services:
  # NATS server started with JetStream (-js), tracing (--trace) and the HTTP
  # monitoring endpoint on port 8222 (-m 8222).
  nats-server:
    image: nats:2.11.4
    command: ["-js", "--trace", "-m", "8222"]
    ports:
      # Port mappings are quoted so they are always read as strings, matching
      # the prometheus and grafana services.
      - "4222:4222"
      - "6222:6222"
      - "8222:8222"  # the endpoints include /varz, /healthz, ...
    networks:
      - server
      - monitoring

  # etcd node, run with the image's ALLOW_NONE_AUTHENTICATION switch set to yes.
  etcd-server:
    image: bitnamilegacy/etcd:3.6.1
    environment:
      - ALLOW_NONE_AUTHENTICATION=yes
    ports:
      # Port mappings are quoted so they are always read as strings, matching
      # the prometheus and grafana services.
      - "2379:2379"  # this port exposes the /metrics endpoint
      - "2380:2380"
    networks:
      - server
      - monitoring

  # All the services below are part of the metrics profile and monitoring network.

  # The exporter translates from /varz and other stats to Prometheus metrics.
  # It polls the nats-server monitoring endpoint (port 8222) and is only started
  # when the `metrics` profile is enabled.
  nats-prometheus-exporter:
    image: natsio/prometheus-nats-exporter:0.17.3
    command:
      - "-varz"
      - "-connz"
      - "-routez"
      - "-subz"
      - "-gatewayz"
      - "-leafz"
      - "-jsz=all"
      - "http://nats-server:8222"
    ports:
      # Quoted so the mapping is always read as a string.
      - "7777:7777"
    networks:
      - monitoring
    profiles: [metrics]
    depends_on:
      - nats-server

  # DCGM stands for Data Center GPU Manager: https://developer.nvidia.com/dcgm
  # dcgm-exporter is a tool from NVIDIA that exposes DCGM metrics in Prometheus format.
  # It belongs to the `metrics` profile like the other monitoring services, so a
  # plain `docker compose up` does not try to start a container that needs the
  # NVIDIA runtime.
  dcgm-exporter:
    image: nvidia/dcgm-exporter:4.2.3-4.1.3-ubi9
    ports:
      # Expose dcgm-exporter on port 9401 both inside and outside the container
      # to avoid conflicts with other dcgm-exporter instances in distributed environments.
      # To access DCGM metrics:
      # Outside the container: curl http://localhost:9401/metrics (or the host IP)
      # Inside the container (container-to-container): curl http://dcgm-exporter:9401/metrics
      - "9401:9401"
    cap_add:
      - SYS_ADMIN
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    environment:
      # dcgm uses NVIDIA_VISIBLE_DEVICES variable but normally it is CUDA_VISIBLE_DEVICES
      - NVIDIA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-all}
      # Must match the container port published above.
      - DCGM_EXPORTER_LISTEN=:9401
    runtime: nvidia  # Specify the NVIDIA runtime
    networks:
      - monitoring
    profiles: [metrics]

  # Prometheus server; its scrape configuration is bind-mounted from
  # ./metrics/prometheus.yml.
  # To access Prometheus from another machine, you may need to open the port in your host's firewall. On Ubuntu:
  # sudo ufw allow 9090/tcp
  prometheus:
    image: prom/prometheus:v3.4.1
    container_name: prometheus
    volumes:
      - ./metrics/prometheus.yml:/etc/prometheus/prometheus.yml
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      # These provide the web console functionality
      - '--web.console.libraries=/etc/prometheus/console_libraries'
      - '--web.console.templates=/etc/prometheus/consoles'
      # Enables the HTTP lifecycle endpoints (e.g. config reload)
      - '--web.enable-lifecycle'
    restart: unless-stopped
    # Example to pull from the /query endpoint:
    # {__name__=~"DCGM.*", job="dcgm-exporter"}
    networks:
      - monitoring
    ports:
      - "9090:9090"
    profiles: [metrics]
    # Makes host.docker.internal resolve to the host gateway from inside the container.
    extra_hosts:
      - "host.docker.internal:host-gateway"
    depends_on:
      - dcgm-exporter
      - nats-prometheus-exporter
      - etcd-server

  # Grafana connects to Prometheus via the /query endpoint.
  # Default credentials are dynamo/dynamo; override them by setting
  # GRAFANA_ADMIN_USER / GRAFANA_ADMIN_PASSWORD in the shell environment or a .env file.
  # To access Grafana from another machine, you may need to open the port in your host's firewall. On Ubuntu:
  # sudo ufw allow 3001/tcp
  grafana:
    image: grafana/grafana-enterprise:12.0.1
    container_name: grafana
    volumes:
      # Provisioned dashboards and the Prometheus datasource definition
      - ./metrics/grafana_dashboards:/etc/grafana/provisioning/dashboards
      - ./metrics/grafana-datasources.yml:/etc/grafana/provisioning/datasources/datasources.yml
    environment:
      # Must match the container port published under `ports` below
      - GF_SERVER_HTTP_PORT=3001
      # do not make it admin/admin, because you will be prompted to change the password every time
      - GF_SECURITY_ADMIN_USER=${GRAFANA_ADMIN_USER:-dynamo}
      - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:-dynamo}
      - GF_USERS_ALLOW_SIGN_UP=false
      - GF_INSTALL_PLUGINS=grafana-piechart-panel
      # Default min interval is 5s, but can be configured lower
      - GF_DASHBOARDS_MIN_REFRESH_INTERVAL=2s
      # Disable password change requirement
      - GF_SECURITY_DISABLE_INITIAL_ADMIN_CREATION=false
      - GF_SECURITY_ADMIN_PASSWORD_POLICY=false
      - GF_AUTH_DISABLE_LOGIN_FORM=false
      - GF_AUTH_DISABLE_SIGNOUT_MENU=false
    restart: unless-stopped
    ports:
      - "3001:3001"
    networks:
      - monitoring
    profiles: [metrics]
    depends_on:
      - prometheus