# docker-compose.jetson.yml — NVIDIA-AI-Blueprints/nemotron-voice-agent (main branch)
# SPDX-FileCopyrightText: Copyright (c) 2024–2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-2-Clause

# Compose project name for the Jetson deployment of the voice agent.
name: nemotron-voice-agent-jetson

# Named volumes shared by the services below.
volumes:
  # Mounted by the llm-nvidia-jetson service at /root/.cache; declared with
  # default settings (empty mapping).
  nim_cache: {}

services:
  # =============================================================================
  # LLM Service
  # =============================================================================

  # vLLM OpenAI-compatible API server. It listens on port 8000 inside the
  # container and is published on the host as port 9000.
  llm-nvidia-jetson:
    image: nvcr.io/nvidia/vllm:25.10-py3
    container_name: llm-nvidia-jetson
    runtime: nvidia
    restart: unless-stopped
    shm_size: 16GB
    env_file:
      - .env
    environment:
      HF_TOKEN: "${HF_TOKEN}"
    ports:
      - '9000:8000'
    volumes:
      # Named volume keeps /root/.cache across container re-creation
      # (presumably where downloaded model files are cached — confirm).
      - nim_cache:/root/.cache
    # Run the command through bash so the shell expands the ${VAR:-default}
    # expressions inside the container.
    entrypoint:
      - /bin/bash
      - "-c"
    # "$$" is Compose's escape for a literal "$": NVIDIA_LLM_MODEL and
    # GPU_MEMORY_UTILIZATION are resolved by bash from the container
    # environment, falling back to the defaults written after ":-".
    command:
      - |
        python3 -m vllm.entrypoints.openai.api_server \
          --model "$${NVIDIA_LLM_MODEL:-RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16}" \
          --host 0.0.0.0 \
          --port 8000 \
          --max-model-len 4096 \
          --gpu-memory-utilization "$${GPU_MEMORY_UTILIZATION:-0.15}" \
          --enforce-eager \
          --trust-remote-code
    # Probe the server's /health endpoint. The 1200s start period gives the
    # server 20 minutes to come up before failed probes count against retries.
    healthcheck:
      test:
        - CMD-SHELL
        - 'curl -f http://localhost:8000/health || exit 1'
      start_period: 1200s
      interval: 30s
      timeout: 15s
      retries: 10
    # Rotate container logs: at most 5 files of 50 MB each.
    logging:
      driver: json-file
      options:
        max-size: '50m'
        max-file: '5'

  # =============================================================================
  # Voice Agent Application
  # =============================================================================

  # Voice agent backend. Built from the repository-root Dockerfile; it starts
  # either the WebSocket or the WebRTC pipeline on port 7860.
  python-app:
    build:
      context: .
      dockerfile: Dockerfile
    image: nvcr.io/nvidia/blueprint/nemotron-voice-agent:${IMAGE_VERSION:-dev}
    container_name: voice-agent-webrtc-jetson
    # Host networking: the container shares the host's network stack, so no
    # "ports" mapping is declared and port 7860 is bound directly on the host.
    # NOTE(review): presumably this is also how the app reaches the LLM service
    # published on host port 9000 — confirm against the app configuration.
    network_mode: host
    env_file:
      - .env
    environment:
      - NVIDIA_API_KEY=${NVIDIA_API_KEY}
    # Pipeline selection happens inside the container: "$$" is Compose's escape
    # for a literal "$", so bash (not Compose) expands TRANSPORT and WORKERS.
    #   TRANSPORT=WEBSOCKET -> src/pipeline_websocket.py
    #   anything else (default WEBRTC) -> src/pipeline.py
    # WORKERS defaults to 1.
    command: >
      bash -c "
      if [ \"$${TRANSPORT:-WEBRTC}\" = \"WEBSOCKET\" ]; then
        echo \"Starting WebSocket pipeline...\"
        uv run src/pipeline_websocket.py --host 0.0.0.0 --port 7860 --workers $${WORKERS:-1}
      else
        echo \"Starting WebRTC pipeline...\"
        uv run src/pipeline.py --host 0.0.0.0 --port 7860 --workers $${WORKERS:-1}
      fi
      "
    volumes:
      - ./audio_dumps:/app/audio_dumps
      - ./config:/app/config
    restart: unless-stopped
    healthcheck:
      # -f makes curl exit non-zero on HTTP error responses (status >= 400).
      # Without it, curl exits 0 for any response the server sends, so a 404 or
      # 500 from /docs would still mark the container healthy.
      test: ["CMD-SHELL", "curl -sf http://localhost:7860/docs || exit 1"]
      interval: 30s
      timeout: 10s
      retries: 3

  # =============================================================================
  # UI Application
  # =============================================================================

  # Web UI container, built from frontend/Dockerfile with the repository root
  # as build context. Container port 8000 is published on host port 8081.
  ui-app:
    image: nvcr.io/nvidia/blueprint/nemotron-voice-agent-ui:${IMAGE_VERSION:-dev}
    container_name: webrtc-ui-jetson
    build:
      dockerfile: frontend/Dockerfile
      context: .
    restart: unless-stopped
    ports:
      - '8081:8000'