forked from QwenLM/Qwen3-TTS
-
Notifications
You must be signed in to change notification settings - Fork 64
Expand file tree
/
Copy pathDockerfile.rocm
More file actions
63 lines (51 loc) · 1.99 KB
/
Copy pathDockerfile.rocm
File metadata and controls
63 lines (51 loc) · 1.99 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# Qwen3-TTS OpenAI-compatible API server.
# ROCm/AMD-focused image for the optimized backend. The base tag pins
# ROCm 6.3.1, Ubuntu 22.04, Python 3.12 and PyTorch 2.6.0.
FROM rocm/pytorch:rocm6.3.1_ubuntu22.04_py3.12_pytorch_release_2.6.0

# Python process defaults for the container:
#   PYTHONUNBUFFERED        - do not buffer stdout/stderr, so logs appear immediately
#   PYTHONDONTWRITEBYTECODE - do not write .pyc files
#   NUMBA_CACHE_DIR         - keep Numba's on-disk cache under /tmp
ENV PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1 \
    NUMBA_CACHE_DIR=/tmp/numba_cache

# Relative paths in the instructions that follow resolve against /app.
WORKDIR /app
# OS packages (kept in alphabetical order for easy diffing).
#   curl        - used by the HEALTHCHECK probe
#   git         - used to clone flash-attention during the build
#   ffmpeg, libsndfile1 - presumably needed for audio I/O by the TTS stack
# The apt package index is deleted in the same layer so it never ships in the image.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        curl \
        ffmpeg \
        git \
        libsndfile1 \
    && rm -rf /var/lib/apt/lists/*
# Python packaging/build tooling that must be present before flash-attn is
# built from source (that build runs without pip's build isolation).
RUN pip install --no-cache-dir --upgrade \
    pip \
    setuptools \
    wheel \
    ninja \
    packaging
# Install flash-attention from source with its Triton backend for AMD/ROCm.
# FLASH_ATTN_REF is the upstream tag or branch to build; override it with
#   docker build --build-arg FLASH_ATTN_REF=<ref> ...
ARG FLASH_ATTN_REF=v2.8.3
# Clone, build and delete the checkout in a single layer so the source tree
# never ends up in the image.
#   --no-cache-dir        keeps pip's download cache out of this layer,
#                         consistent with the other pip invocations in this file
#   --no-build-isolation  builds against the packages already installed in the
#                         image instead of a fresh isolated build environment
RUN cd /tmp && \
    git clone --depth 1 --branch "${FLASH_ATTN_REF}" https://github.com/Dao-AILab/flash-attention.git && \
    cd flash-attention && \
    FLASH_ATTENTION_TRITON_AMD_ENABLE=TRUE pip install --no-cache-dir --no-build-isolation . && \
    cd / && rm -rf /tmp/flash-attention
# Copy the build context into the working directory, then install the project
# in editable mode together with its "api" extra; dependencies are resolved
# from the project metadata.
# PyYAML is included for the optimized backend's YAML config loading.
COPY . ./
RUN pip install \
    --no-cache-dir \
    --editable ".[api]"
# Seed the default optimized-backend location: create the voice library
# directory and place the repository's config.yaml next to it.
RUN cfg_root=/root/qwen3-tts \
    && mkdir -p "${cfg_root}/voice_library" \
    && cp config.yaml "${cfg_root}/config.yaml"
# AMD ROCm runtime tuning derived from the analyzed fork.

# Attention / kernel selection. FLASH_ATTENTION_TRITON_AMD_ENABLE is the same
# switch that is set when flash-attn is built earlier in this file.
ENV FLASH_ATTENTION_TRITON_AMD_ENABLE=TRUE \
    MIOPEN_FIND_MODE=FAST \
    TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1 \
    TORCH_BLAS_PREFER_HIPBLASLT=1

# PyTorch TunableOp: enabled, with results stored beside the backend config.
ENV PYTORCH_TUNABLEOP_ENABLED=1 \
    PYTORCH_TUNABLEOP_FILENAME=/root/qwen3-tts/tunableop_results.csv

# GPU allocation / queue knobs.
# NOTE(review): values are carried over unchanged; confirm their effect
# against the ROCm runtime documentation before tuning.
ENV GPU_MAX_ALLOC_PERCENT=100 \
    GPU_MAX_HEAP_SIZE=100 \
    GPU_MAX_HW_QUEUES=1 \
    GPU_KEEPALIVE_INTERVAL=15

# Application defaults (presumably read by api.main at startup); the paths
# match the config file and voice library directory created in this image.
ENV TTS_BACKEND=optimized \
    TTS_CONFIG=/root/qwen3-tts/config.yaml \
    VOICE_LIBRARY_DIR=/root/qwen3-tts/voice_library \
    HOST=0.0.0.0 \
    PORT=8880 \
    WORKERS=1
# Documented listening port (the image's default PORT value).
EXPOSE 8880

# Liveness probe against the /health endpoint. The shell form is used on
# purpose so ${PORT} is expanded inside the container at probe time: running
# with `-e PORT=<n>` then probes the overridden port instead of a hard-coded
# one (assumes api.main binds to PORT - confirm). Falls back to 8880 when
# PORT is unset. -sS silences progress output but still prints errors.
# The long start period presumably allows for model loading.
HEALTHCHECK --interval=30s --timeout=10s --start-period=120s --retries=3 \
    CMD curl -fsS "http://localhost:${PORT:-8880}/health" || exit 1

# Exec form: the Python process runs as PID 1 and receives stop signals directly.
CMD ["python", "-m", "api.main"]