# Source: Qwen3-TTS-Openai-Fastapi/Dockerfile.rocm at main · groxaxo/Qwen3-TTS-Openai-Fastapi · GitHub
# Qwen3-TTS OpenAI-Compatible API Server
# ROCm/AMD-focused image for the optimized backend

# Base image; per its tag it bundles ROCm 6.3.1, Ubuntu 22.04, Python 3.12
# and PyTorch 2.6.0.
FROM rocm/pytorch:rocm6.3.1_ubuntu22.04_py3.12_pytorch_release_2.6.0

# Interpreter-level settings: unbuffered stdout/stderr, no .pyc files, and a
# Numba cache directory under /tmp.
ENV PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1 \
    NUMBA_CACHE_DIR=/tmp/numba_cache

# Relative paths in the instructions that follow resolve against /app.
WORKDIR /app

# OS packages: git is used to clone flash-attention and curl is used by the
# HEALTHCHECK; ffmpeg and libsndfile1 are presumably needed for audio
# handling (verify against the application code).
# The apt package lists are removed in the same layer to keep it small.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        curl \
        ffmpeg \
        git \
        libsndfile1 \
    && rm -rf /var/lib/apt/lists/*

# Python build tooling for flash-attn. That build runs with
# --no-build-isolation, so these must already be installed in the environment.
RUN pip install --no-cache-dir --upgrade \
        ninja \
        packaging \
        pip \
        setuptools \
        wheel

# Install flash attention from source for ROCm/Triton.
# FLASH_ATTN_REF is the upstream git tag or branch to build; override it with
# `--build-arg FLASH_ATTN_REF=<ref>`.
ARG FLASH_ATTN_REF=v2.8.3
# Clone, build and clean up in a single layer so the source tree never
# persists in the image.
# --no-cache-dir keeps pip's cache (which would otherwise retain the locally
# built wheel) out of the layer, matching the other pip invocations here.
# --no-build-isolation makes the build use the packages already installed in
# the environment instead of a fresh isolated build environment.
RUN cd /tmp && \
    git clone --depth 1 --branch "${FLASH_ATTN_REF}" https://github.com/Dao-AILab/flash-attention.git && \
    cd flash-attention && \
    FLASH_ATTENTION_TRITON_AMD_ENABLE=TRUE pip install --no-cache-dir --no-build-isolation . && \
    cd / && rm -rf /tmp/flash-attention

# Copy the whole build context into the working directory, then install the
# project in editable mode together with its "api" extra as declared in the
# project metadata.
# PyYAML is included for the optimized backend's YAML config loading.
COPY . ./
RUN pip install --no-cache-dir --editable ".[api]"

# Seed the optimized backend's default location: create the directory along
# with its voice_library subdirectory, then copy the project's config.yaml
# into it.
RUN mkdir -p /root/qwen3-tts/voice_library && \
    cp config.yaml /root/qwen3-tts/config.yaml

# Runtime environment defaults baked into the image.

# AMD ROCm runtime tuning derived from the analyzed fork: flash-attention,
# MIOpen and PyTorch switches. The TunableOp results file lives next to the
# backend config under /root/qwen3-tts.
ENV FLASH_ATTENTION_TRITON_AMD_ENABLE=TRUE \
    MIOPEN_FIND_MODE=FAST \
    TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1 \
    TORCH_BLAS_PREFER_HIPBLASLT=1 \
    PYTORCH_TUNABLEOP_ENABLED=1 \
    PYTORCH_TUNABLEOP_FILENAME=/root/qwen3-tts/tunableop_results.csv

# GPU_* knobs from the same tuning set.
# NOTE(review): which component consumes each of these is not visible here.
ENV GPU_MAX_ALLOC_PERCENT=100 \
    GPU_MAX_HEAP_SIZE=100 \
    GPU_MAX_HW_QUEUES=1 \
    GPU_KEEPALIVE_INTERVAL=15

# Server settings, presumably read by api.main (confirm against the code).
# TTS_CONFIG and VOICE_LIBRARY_DIR point at the paths created during the build.
ENV TTS_BACKEND=optimized \
    TTS_CONFIG=/root/qwen3-tts/config.yaml \
    VOICE_LIBRARY_DIR=/root/qwen3-tts/voice_library \
    HOST=0.0.0.0 \
    PORT=8880 \
    WORKERS=1

# Default API port. EXPOSE is documentation only; publish it with `-p`.
EXPOSE 8880

# Probe the /health endpoint. The port follows the PORT environment variable
# (falling back to 8880) so that overriding PORT at `docker run` time does not
# leave the probe pointing at the wrong port.
# NOTE(review): assumes api.main binds to $PORT - confirm.
# -sS suppresses curl's progress meter but still prints errors, keeping the
# health log readable; -f turns HTTP error statuses into a non-zero exit.
HEALTHCHECK --interval=30s --timeout=10s --start-period=120s --retries=3 \
    CMD curl -fsS "http://localhost:${PORT:-8880}/health" || exit 1

# Exec form: Python runs directly, without a wrapping shell.
CMD ["python", "-m", "api.main"]