Skip to content

Commit 51c73f3

Browse files
jinsoo
authored and committed
feat: distill dockerfile in pod
1 parent fab6dfc commit 51c73f3

File tree

8 files changed

+88
-74
lines changed

8 files changed

+88
-74
lines changed

.env.example

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
OPENAI_API_KEY=""
22
HF_API=""
3+
GEMINI_API_KEY=""
34
# OpenAI 429 시 vLLM(Qwen2.5-7B 등) 폴백. true + vllm 설치 시 429 발생하면 vLLM으로 재시도. 자세한 내용: etc_md/OPENAI_RATE_LIMIT_VLLM_FALLBACK.md
45
# ENABLE_VLLM_FALLBACK_ON_RATE_LIMIT=false
56
# vLLM 사용 시(429 폴백·기동 시 1차) RunPod Serverless GPU 사용. true면 요청 시 GPU 기동·유휴 시 스케일다운(비용 절감). 대상: RunPod. RUNPOD_API_KEY 필요.

Dockerfile.labeling-llm

Lines changed: 22 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,38 +1,29 @@
1-
# 라벨링 전용 이미지 (label_for_distill.py)
2-
# CPU 환경. LLM은 OpenAI API 또는 RunPod vLLM(VLLM_POD_BASE_URL) HTTP 호출.
1+
# RunPod Pod 전용 vLLM 이미지 (네트워크 볼륨 /workspace, OpenAI 호환 API)
2+
# - 이미지 내 /opt/llm-models 에 Qwen/Qwen2.5-7B-Instruct 포함.
3+
# - 엔트리포인트: /workspace에 모델 있으면 바로 vLLM / 없으면 이미지→/workspace 복사 후 vLLM / 복사 실패 시 다운로드(재시도).
34
#
4-
# 빌드: docker build -f Dockerfile.labeling-llm -t tasteam-labeling-llm .
5-
# 실행: docker run -v $(pwd)/distill_pipeline_output:/app/distill_pipeline_output \
6-
# -v $(pwd)/tasteam_app_all_review_data.json:/app/tasteam_app_all_review_data.json \
7-
# --env-file .env tasteam-labeling-llm
5+
# 빌드: docker build -f Dockerfile.runpod-pod-vllm -t tasteam-runpod-pod-vllm:latest .
6+
# (빌드 시 HuggingFace에서 모델 다운로드하므로 시간·용량 큼. HF_TOKEN 있으면 private 허용)
87
#
9-
# 환경변수: OPENAI_API_KEY, VLLM_POD_BASE_URL(teacher), LLM_PROVIDER 등
10-
11-
FROM python:3.11-slim
12-
13-
ENV PYTHONUNBUFFERED=1
14-
ENV PYTHONDONTWRITEBYTECODE=1
15-
16-
WORKDIR /app
17-
18-
# PyTorch CPU (LLMUtils import용, 실제 추론은 HTTP)
19-
RUN pip install --no-cache-dir --upgrade pip && \
20-
pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu
8+
# 실행 예 (로컬 테스트):
9+
# docker run --gpus all -p 8000:8000 --ipc=host \
10+
# -e HF_TOKEN=$HF_TOKEN -v /path/to/volume:/workspace \
11+
# tasteam-runpod-pod-vllm:latest
12+
#
13+
FROM vllm/vllm-openai:v0.11.0
2114

22-
COPY requirements.labeling-llm.txt /app/
23-
RUN pip install --no-cache-dir -r requirements.labeling-llm.txt
15+
ENV MODEL_NAME=/workspace/llm-models/Qwen/Qwen2.5-7B-Instruct
16+
ENV HF_MODEL_ID=Qwen/Qwen2.5-7B-Instruct
17+
# 이미지 내 모델 경로 (빌드 시 채워짐. 기동 시 /workspace 없으면 여기서 복사)
18+
ENV MODEL_IMAGE_PATH=/opt/llm-models/Qwen/Qwen2.5-7B-Instruct
2419

25-
COPY src /app/src
26-
COPY scripts/label_for_distill.py scripts/data_augmentation.py /app/scripts/
20+
RUN pip install -q huggingface_hub && \
21+
python3 -c "from huggingface_hub import snapshot_download; snapshot_download('Qwen/Qwen2.5-7B-Instruct', local_dir='/opt/llm-models/Qwen/Qwen2.5-7B-Instruct', local_dir_use_symlinks=False)"
2722

28-
ENV PYTHONPATH=/app
23+
COPY scripts/runpod_pod_entrypoint.sh /entrypoint.sh
24+
RUN chmod +x /entrypoint.sh
2925

30-
# 실행: --train-path, --output-dir 등 인자 전달 필요
31-
# 예: docker run ... tasteam-labeling-llm --train-path /app/datasets/xxx/train.json --output-dir /app/labeled/xxx --openai-cap 500
32-
ENTRYPOINT ["python", "scripts/label_for_distill.py"]
33-
CMD ["--help"]
26+
ENTRYPOINT ["/entrypoint.sh"]
27+
CMD ["--model", "/workspace/llm-models/Qwen/Qwen2.5-7B-Instruct", "--max-model-len", "4096", "--tensor-parallel-size", "1", "--gpu-memory-utilization", "0.90"]
3428

35-
# docker build -f Dockerfile.labeling-llm -t tasteam-labeling-llm .
36-
# docker run -v $(pwd)/distill_pipeline_output:/app/distill_pipeline_output --env-file .env tasteam-labeling-llm \
37-
# --train-path /app/distill_pipeline_output/datasets/xxx/train.json \
38-
# --output-dir /app/distill_pipeline_output/labeled/xxx --openai-cap 500
29+
EXPOSE 8000

Dockerfile.train-llm

Lines changed: 33 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,38 @@
11
# 학습 전용 이미지 (scripts/train_qlora.py)
22
# GPU 필수. QLoRA SFT, wandb.
3+
# 모델: 빌드 시 /opt에 다운로드, 기동 시 /workspace 없으면 복사 후 train_qlora 실행.
34
#
45
# 빌드: docker build -f Dockerfile.train-llm -t tasteam-train-llm .
5-
# 실행: docker run --gpus all \
6-
# -v $(pwd)/distill_pipeline_output:/app/distill_pipeline_output \
7-
# --env-file .env tasteam-train-llm --labeled-path /app/distill_pipeline_output/labeled/xxx/train_labeled.json --output-dir /app/distill_pipeline_output
86
#
9-
# 환경변수: WANDB_API_KEY, HF_HOME 등
7+
# 로컬 실행:
8+
# docker run --gpus all \
9+
# -v $(pwd)/distill_pipeline_output:/app/distill_pipeline_output \
10+
# --env-file .env tasteam-train-llm \
11+
# --labeled-path /app/distill_pipeline_output/labeled/YYYYMMDD_HHMMSS/train_labeled.json \
12+
# --output-dir /app/distill_pipeline_output
13+
#
14+
# RunPod Pod (Network Volume, 학습 결과 저장):
15+
# -v /path/to/network/volume:/workspace \
16+
# -e HF_HOME=/workspace/hf-cache -e HF_HUB_CACHE=/workspace/hf-cache \
17+
# tasteam-train-llm \
18+
# --labeled-path /workspace/distill_pipeline_output/labeled/YYYYMMDD_HHMMSS/train_labeled.json \
19+
# --output-dir /workspace/distill_pipeline_output
20+
# (adapter → /workspace/distill_pipeline_output/runs/YYYYMMDD_HHMMSS/adapter)
21+
#
22+
# 환경변수: WANDB_API_KEY, HF_HOME(선택), HF_HUB_CACHE(선택)
23+
24+
FROM runpod/pytorch:2.4.0-py3.11-cuda12.4.1-devel-ubuntu22.04
1025

11-
FROM pytorch/pytorch:2.1.0-cuda12.1-cudnn8-runtime
26+
ENV MODEL_NAME=/workspace/llm-models/Qwen/Qwen2.5-0.5B-Instruct
27+
ENV HF_MODEL_ID=Qwen/Qwen2.5-0.5B-Instruct
28+
# 이미지 내 모델 경로 (빌드 시 채워짐. 기동 시 /workspace 없으면 여기서 복사)
29+
ENV MODEL_IMAGE_PATH=/opt/llm-models/Qwen/Qwen2.5-0.5B-Instruct
30+
31+
RUN pip install -q huggingface_hub && \
32+
python3 -c "from huggingface_hub import snapshot_download; snapshot_download('Qwen/Qwen2.5-0.5B-Instruct', local_dir='/opt/llm-models/Qwen/Qwen2.5-0.5B-Instruct', local_dir_use_symlinks=False)"
33+
34+
COPY scripts/runpod_pod_entrypoint_05b.sh /entrypoint.sh
35+
RUN chmod +x /entrypoint.sh
1236

1337
ENV PYTHONUNBUFFERED=1
1438
ENV PYTHONDONTWRITEBYTECODE=1
@@ -19,14 +43,11 @@ COPY requirements.train-llm.txt /app/
1943
RUN pip install --no-cache-dir --upgrade pip && \
2044
pip install --no-cache-dir -r requirements.train-llm.txt
2145

46+
# train_qlora.py 단독 사용. src/ import 시 COPY 추가 필요.
2247
COPY scripts/train_qlora.py /app/scripts/
2348

2449
ENV PYTHONPATH=/app
2550

26-
ENTRYPOINT ["python", "scripts/train_qlora.py"]
27-
CMD ["--help"]
28-
29-
# docker build -f Dockerfile.train-llm -t tasteam-train-llm .
30-
# docker run --gpus all -v $(pwd)/distill_pipeline_output:/app/distill_pipeline_output --env-file .env tasteam-train-llm \
31-
# --labeled-path /app/distill_pipeline_output/labeled/xxx/train_labeled.json \
32-
# --output-dir /app/distill_pipeline_output
51+
ENTRYPOINT ["/entrypoint.sh"]
52+
# 기본: /workspace (RunPod 네트워크 볼륨). 로컬 실행 시 --output-dir /app/distill_pipeline_output 오버라이드.
53+
CMD ["--output-dir", "/workspace/distill_pipeline_output"]

Dockerfile.vllm-pod

Lines changed: 0 additions & 20 deletions
This file was deleted.

requirements.train-llm.txt

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,17 +4,17 @@
44
# 사용: pip install -r requirements.train-llm.txt
55
# Docker: Dockerfile.train-llm (PyTorch CUDA 베이스)
66

7-
# LLM / 학습
8-
transformers>=4.30.0
7+
# LLM / 학습 (버전 고정: transformers/peft/trl 호환성)
8+
transformers==4.44.2
99
accelerate>=0.20.0
10-
peft>=0.10.0
10+
peft==0.13.2
1111
bitsandbytes>=0.43.0
12-
trl>=0.12.0
12+
trl==0.12.0
1313
datasets>=2.18.0
1414
huggingface_hub>=0.20.0
1515

16-
# 오케스트레이션
17-
prefect>=2.0.0
16+
# prefect: WANDB_RUN_ID 전달용, train_qlora는 import 안 함. 선택 사항(이미지 절약 시 제거 가능)
17+
# prefect>=2.0.0
1818

1919
# 유틸
2020
python-dotenv>=1.0.0

scripts/runpod_cli/pod_create_delete_cli.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ def get_default_pod_payload() -> Dict[str, Any]:
8484
"dataCenterPriority": "availability",
8585
"dockerEntrypoint": [],
8686
"dockerStartCmd": [],
87-
"env": {"ENV_VAR": "value"},
87+
"env": {"ENV_VAR": "value",**({"WANDB_API_KEY": os.environ["WANDB_API_KEY"]} if os.environ.get("WANDB_API_KEY") else {}),},
8888
"globalNetworking": False,
8989
"gpuCount": 1,
9090
"gpuTypeIds": ["NVIDIA GeForce RTX 4090"],
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
#!/usr/bin/env bash
2+
# RunPod Pod 학습용: /workspace에 모델 있으면 바로 train_qlora / 없으면 이미지→/workspace 복사 후 train_qlora.
3+
4+
set -e
5+
6+
MODEL_PATH="${MODEL_NAME:-/workspace/llm-models/Qwen/Qwen2.5-0.5B-Instruct}"
7+
MODEL_IMAGE_PATH="${MODEL_IMAGE_PATH:-/opt/llm-models/Qwen/Qwen2.5-0.5B-Instruct}"
8+
9+
if [[ -f "${MODEL_PATH}/config.json" ]]; then
10+
echo "[entrypoint] Model present at ${MODEL_PATH}, starting Qwen2.5-0.5B-Instruct."
11+
elif [[ -f "${MODEL_IMAGE_PATH}/config.json" ]]; then
12+
echo "[entrypoint] Copying model from image ${MODEL_IMAGE_PATH} to ${MODEL_PATH}..."
13+
mkdir -p "$(dirname "$MODEL_PATH")"
14+
cp -a "${MODEL_IMAGE_PATH}" "$(dirname "$MODEL_PATH")/"
15+
echo "[entrypoint] Copy done, starting Qwen2.5-0.5B-Instruct."
16+
else
17+
echo "[entrypoint] ERROR: Model not found at ${MODEL_PATH} nor in image at ${MODEL_IMAGE_PATH}." >&2
18+
exit 1
19+
fi
20+
21+
exec python /app/scripts/train_qlora.py --student-model "${MODEL_PATH}" "$@"

src/config.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -74,21 +74,21 @@ class _InferenceConfig:
7474
ENABLE_OPENAI_FALLBACK: bool = os.getenv("ENABLE_OPENAI_FALLBACK", "false").lower() == "true"
7575
# 벤더 API 페일오버 (failover_router_strategy.md): OpenAI 429/5xx 시 Gemini 등 다른 벤더로 전환
7676
GEMINI_API_KEY: Optional[str] = os.getenv("GEMINI_API_KEY")
77-
GEMINI_MODEL: str = os.getenv("GEMINI_MODEL", "gemini-1.5-flash")
77+
GEMINI_MODEL: str = os.getenv("GEMINI_MODEL", "gemini-2.5-flash")
7878
LLM_FAILOVER_ENABLED: bool = bool(os.getenv("GEMINI_API_KEY", "").strip())
7979
# [Deprecated] OpenAI 429 시 self-hosted vLLM 폴백 → 현재는 벤더 API(LLMFailoverRouter) 페일오버로 대체됨
8080
ENABLE_VLLM_FALLBACK_ON_RATE_LIMIT: bool = os.getenv("ENABLE_VLLM_FALLBACK_ON_RATE_LIMIT", "false").lower() == "true"
8181
# [Deprecated] vLLM 폴백 시 RunPod Serverless 사용 → 벤더 API 페일오버(GEMINI_API_KEY)로 대체됨
8282
VLLM_USE_RUNPOD_GPU: bool = os.getenv("VLLM_USE_RUNPOD_GPU", "false").lower() == "true"
8383
# RunPod vLLM 전용 엔드포인트 ID. 미설정 시 RUNPOD_ENDPOINT_ID 사용.
84-
RUNPOD_VLLM_ENDPOINT_ID: Optional[str] = (os.getenv("RUNPOD_VLLM_ENDPOINT_ID", "2mpd5y6lvccfk1") or "").strip() or None
84+
#RUNPOD_VLLM_ENDPOINT_ID: Optional[str] = (os.getenv("RUNPOD_VLLM_ENDPOINT_ID", "2mpd5y6lvccfk1") or "").strip() or None
8585
RUNPOD_API_KEY: Optional[str] = os.getenv("RUNPOD_API_KEY")
86-
RUNPOD_ENDPOINT_ID: str = (os.getenv("RUNPOD_ENDPOINT_ID", "2mpd5y6lvccfk1") or "").strip() or "2mpd5y6lvccfk1"
86+
#RUNPOD_ENDPOINT_ID: str = (os.getenv("RUNPOD_ENDPOINT_ID", "2mpd5y6lvccfk1") or "").strip() or "2mpd5y6lvccfk1"
8787
USE_RUNPOD: bool = os.getenv("USE_RUNPOD", "true").lower() == "true"
8888
RUNPOD_POLL_INTERVAL: int = int(os.getenv("RUNPOD_POLL_INTERVAL", "2"))
8989
RUNPOD_MAX_WAIT_TIME: int = int(os.getenv("RUNPOD_MAX_WAIT_TIME", "300"))
9090
# RunPod Serverless vLLM 엔드포인트 사용 (앱 내 인프로세스 vLLM 제거됨)
91-
USE_POD_VLLM: bool = os.getenv("USE_POD_VLLM", "true").lower() == "true"
91+
#USE_POD_VLLM: bool = os.getenv("USE_POD_VLLM", "true").lower() == "true"
9292
# RunPod Pod 직접 URL (vLLM OpenAI 호환 /v1). 설정 시 Serverless 대신 이 URL로 추론. 기본값: 213.173.108.29:16366 (test_all_task 연동)
9393
VLLM_POD_BASE_URL: Optional[str] = (os.getenv("VLLM_POD_BASE_URL", "http://213.173.108.70:17517/v1") or "").strip() or None
9494

0 commit comments

Comments
 (0)