-
Notifications
You must be signed in to change notification settings - Fork 8
Expand file tree
/
Copy path: docker-compose.amd.yml
More file actions
executable file
·89 lines (85 loc) · 3.29 KB
/
docker-compose.amd.yml
File metadata and controls
executable file
·89 lines (85 loc) · 3.29 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
# Docker Compose for AMD GPUs (ROCm)
#
# Requirements:
# - AMD GPU with ROCm support:
# - Radeon RX 6000/7000 series (discrete)
# - Radeon Instinct (MI series)
# - Radeon Pro series
# - AMD APUs (Ryzen AI 300 series, Ryzen 8000G)
# - ROCm runtime installed: https://rocm.docs.amd.com/en/latest/deploy/linux/index.html
# - Verify: rocm-smi should work on host (optional, sysfs fallback available)
#
# Quick Start:
# 1. Copy this file to your project root: cp docs/docker-compose.amd.yml docker-compose.yml
# 2. Copy environment template: cp ENV_DEFAULT .env
# 3. Get group IDs and add to .env:
# echo "RENDER_GID=$(getent group render | cut -d: -f3)" >> .env
# echo "VIDEO_GID=$(getent group video | cut -d: -f3)" >> .env
# 4. Run: docker-compose up -d
#
# Verify GPU access:
# docker exec smarterrouter ls /sys/class/drm
# docker logs smarterrouter | grep -i "amd\|gpu"
#
# Troubleshooting:
# - If GPU not detected, check: ls /sys/class/drm/card*/device/mem_info_vram_total
# - Ensure user is in render/video groups: groups $USER
# - For compute workloads, you may need a ROCm base image (see below)
#
# APU (Unified Memory) Setup:
# AMD APUs like Ryzen AI 300 series use unified memory (CPU+GPU share RAM).
# The router auto-detects APUs and uses GTT pool instead of VRAM carve-out.
# - BIOS setting: Set UMA Frame Buffer to MINIMUM (512MB-2GB), NOT maximum
# - If auto-detection fails, set: ROUTER_AMD_UNIFIED_MEMORY_GB=58 (for 64GB RAM)
#
# ROCm Base Image Option:
# For full ROCm compute support, build from ROCm base:
# ```dockerfile
# FROM rocm/pytorch:rocm6.0_ubuntu22.04_py3.9_pytorch_2.0.1
# COPY . /app
# WORKDIR /app
# RUN pip install -r requirements.txt
# CMD ["python", "-m", "uvicorn", "main:app", "--host", "0.0.0.0", "--port", "11436"]
# ```
services:
  smarterrouter:
    image: ghcr.io/peva3/smarterrouter:latest
    container_name: smarterrouter
    restart: unless-stopped
    # Router API port (long syntax; equivalent to the short form "11436:11436").
    ports:
      - target: 11436
        published: 11436
        protocol: tcp
    env_file:
      - .env
    volumes:
      - ./data:/app/data:rw
      - type: tmpfs
        target: /tmp
    networks:
      - smarterrouter-network
    healthcheck:
      # Probe the service's own /health endpoint from inside the container.
      test: ["CMD", "python3", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:11436/health')"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 10s
    # AMD GPU passthrough: explicit host:container device mappings.
    devices:
      - /dev/kfd:/dev/kfd  # AMD Kernel Fusion Driver (required for ROCm)
      - /dev/dri:/dev/dri  # Direct Rendering Infrastructure (required for GPU access)
    # Non-root GPU access requires membership in the host's render/video groups.
    # Resolve the group IDs on the host and export them via .env:
    #   RENDER_GID=$(getent group render | cut -d: -f3)
    #   VIDEO_GID=$(getent group video | cut -d: -f3)
    group_add:
      - "${RENDER_GID:-109}"  # render group ID (default 109 on Ubuntu)
      - "${VIDEO_GID:-44}"  # video group ID (default 44 on Ubuntu)
    # Optional ROCm tuning (uncomment as needed):
    # environment:
    #   - ROCM_PATH=/opt/rocm
    #   - HIP_VISIBLE_DEVICES=0  # Limit to specific GPU (0, 1, etc.)
    #   - ROUTER_VRAM_MAX_TOTAL_GB=16
    #   - ROUTER_AMD_UNIFIED_MEMORY_GB=58  # For APUs: override unified memory (optional)
# Dedicated user-defined bridge network so the router is isolated from
# the default Docker bridge and reachable by service name.
networks:
  smarterrouter-network:
    driver: bridge  # default driver; explicit for clarity