Skip to content

Commit d5f99f0

Browse files
jinsoojinsoo
authored and committed
feat: manage runpod lifecycle for labeling task in prefect flow
1 parent b69e6f3 commit d5f99f0

File tree

2 files changed

+205
-40
lines changed

2 files changed

+205
-40
lines changed

scripts/distill_flows.py

Lines changed: 163 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@
1717
1818
실행:
1919
python scripts/distill_flows.py build_dataset [--input path] [--out-dir dir]
20+
python scripts/distill_flows.py labeling --train-path datasets/xxx/train.json
21+
python scripts/distill_flows.py labeling_with_pod --train-path datasets/xxx/train.json # Pod 생성→라벨링→삭제
2022
python scripts/distill_flows.py all
2123
"""
2224

@@ -33,8 +35,11 @@
3335

3436
import argparse
3537
import json
38+
import logging
39+
import os
3640
import subprocess
3741
import sys
42+
import time
3843
from datetime import datetime
3944
from pathlib import Path
4045

@@ -48,6 +53,16 @@
4853
_SCRIPT_DIR = Path(__file__).resolve().parent
4954
_PROJECT_ROOT = _SCRIPT_DIR.parent
5055

56+
# RunPodClient import (scripts/runpod_cli)
57+
if str(_SCRIPT_DIR) not in sys.path:
58+
sys.path.insert(0, str(_SCRIPT_DIR))
59+
try:
60+
from runpod_cli.pod_create_delete_cli import RunPodClient
61+
except ImportError:
62+
RunPodClient = None
63+
64+
logger = logging.getLogger(__name__)
65+
5166

5267
@task(name="build-dataset-task", log_prints=True)
5368
def build_dataset_task(
@@ -185,6 +200,123 @@ def labeling_flow(
185200
)
186201

187202

203+
def _wait_for_vllm_ready(base_url: str, timeout_sec: int = 180, poll_interval: int = 10) -> None:
204+
"""vLLM /v1/models 가 응답할 때까지 대기."""
205+
import requests
206+
url = base_url.rstrip("/") + "/v1/models"
207+
deadline = time.time() + timeout_sec
208+
while time.time() < deadline:
209+
try:
210+
r = requests.get(url, timeout=10)
211+
if r.status_code == 200:
212+
logger.info("vLLM ready: %s", url)
213+
return
214+
except Exception as e:
215+
logger.debug("vLLM not ready yet: %s", e)
216+
time.sleep(poll_interval)
217+
raise TimeoutError(f"vLLM at {base_url} did not become ready within {timeout_sec}s")
218+
219+
220+
@task(name="labeling-with-pod-task", log_prints=True)
def labeling_with_pod_task(
    train_path: str,
    val_path: str | None = None,
    test_path: str | None = None,
    openai_cap: int = 500,
    output_labeled_dir: str | None = None,
    pod_wait_timeout_sec: int = 600,
    vllm_ready_timeout_sec: int = 180,
) -> dict:
    """Create a RunPod pod, wait for vLLM, run labeling, then delete the pod.

    Requires the RUNPOD_API_KEY environment variable. Spins up a self-hosted
    vLLM teacher pod, runs ``label_for_distill.py`` against it, and always
    tears the pod down afterwards.

    Args:
        train_path: Path to the train split JSON to label.
        val_path: Optional val split; forwarded only if the file exists.
        test_path: Optional test split; forwarded only if the file exists.
        openai_cap: Cap on OpenAI (gold) labeling calls.
        output_labeled_dir: Output root; defaults to
            ``<project>/distill_pipeline_output``.
        pod_wait_timeout_sec: Max seconds to wait for the pod to reach RUNNING.
        vllm_ready_timeout_sec: Max seconds to wait for vLLM to respond.

    Returns:
        dict with ``labeled_version``, ``labeled_path`` and, when the files
        exist, ``val_labeled_path`` / ``test_labeled_path``.

    Raises:
        RuntimeError: If RunPodClient is unavailable, the pod gets no public
            IP, or the labeling subprocess exits non-zero.
        ValueError: If RUNPOD_API_KEY is not set.
    """
    if RunPodClient is None:
        raise RuntimeError("RunPodClient not available. Check runpod_cli import.")
    token = os.environ.get("RUNPOD_API_KEY")
    if not token:
        raise ValueError("RUNPOD_API_KEY environment variable is required for labeling_with_pod")

    client = RunPodClient(token=token)
    payload = RunPodClient.get_default_pod_payload()
    pod = client.create_pod(payload)
    pod_id = pod["id"]
    print("Pod created:", pod_id)

    try:
        ready = client.wait_until_running(pod_id, timeout_sec=pod_wait_timeout_sec)
        public_ip = ready.get("publicIp")
        if not public_ip:
            raise RuntimeError(f"Pod {pod_id} has no publicIp. Response: {ready}")

        # OpenAI-compatible clients expect the "/v1" suffix in their base URL,
        # but the readiness probe appends "/v1/models" itself — so it gets the
        # bare server URL (the original passed the "/v1"-suffixed URL and
        # polled ".../v1/v1/models").
        server_url = f"http://{public_ip}:8000"
        base_url = f"{server_url}/v1"
        print("Pod ready:", pod_id, "base_url:", base_url)

        _wait_for_vllm_ready(server_url, timeout_sec=vllm_ready_timeout_sec)

        env = os.environ.copy()
        env["VLLM_POD_BASE_URL"] = base_url
        env["USE_POD_VLLM"] = "true"
        env["LLM_PROVIDER"] = "runpod"

        out_dir = Path(output_labeled_dir or _PROJECT_ROOT / "distill_pipeline_output")
        # NOTE(review): datetime.utcnow() is deprecated since Python 3.12;
        # kept to match the file's existing `from datetime import datetime`.
        version = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
        labeled_dir = out_dir / "labeled" / version
        labeled_dir.mkdir(parents=True, exist_ok=True)

        cmd = [
            sys.executable,
            str(_SCRIPT_DIR / "label_for_distill.py"),
            "--train-path", str(train_path),
            "--openai-cap", str(openai_cap),
            "--output-dir", str(labeled_dir),
        ]
        if val_path and Path(val_path).exists():
            cmd.extend(["--val-path", str(val_path)])
        if test_path and Path(test_path).exists():
            cmd.extend(["--test-path", str(test_path)])

        result = subprocess.run(cmd, cwd=str(_PROJECT_ROOT), env=env, capture_output=False)
        if result.returncode != 0:
            raise RuntimeError(f"label_for_distill.py exited with {result.returncode}")

        labeled_path = labeled_dir / "train_labeled.json"
        out = {"labeled_version": version, "labeled_path": str(labeled_path)}
        for name, fn in [("val_labeled_path", "val_labeled.json"), ("test_labeled_path", "test_labeled.json")]:
            p = labeled_dir / fn
            if p.exists():
                out[name] = str(p)
        return out
    finally:
        print("Cleaning up pod:", pod_id)
        try:
            client.delete_pod(pod_id)
        except Exception:
            # Never let a cleanup failure mask the original error; the pod id
            # was printed above so it can be deleted manually.
            logger.exception("Failed to delete pod %s", pod_id)
293+
294+
295+
@flow(name="labeling_with_pod_flow", log_prints=True)
def labeling_with_pod_flow(
    train_path: str,
    val_path: str | None = None,
    test_path: str | None = None,
    openai_cap: int = 500,
    output_labeled_dir: str | Path | None = None,
    pod_wait_timeout_sec: int = 600,
    vllm_ready_timeout_sec: int = 180,
) -> dict:
    """Pod creation -> labeling (OpenAI gold + vLLM teacher) -> pod deletion.

    Implements docs/runpod_cli/cli_strategy.md: "create pod -> wait -> set
    VLLM_POD_BASE_URL and run label_for_distill -> delete pod when done".
    Thin wrapper that normalizes ``output_labeled_dir`` to a string and
    delegates everything to ``labeling_with_pod_task``.
    """
    # Path or str both accepted; the task wants str | None.
    if output_labeled_dir:
        labeled_out = str(output_labeled_dir)
    else:
        labeled_out = None
    return labeling_with_pod_task(
        train_path=train_path,
        val_path=val_path,
        test_path=test_path,
        openai_cap=openai_cap,
        output_labeled_dir=labeled_out,
        pod_wait_timeout_sec=pod_wait_timeout_sec,
        vllm_ready_timeout_sec=vllm_ready_timeout_sec,
    )
318+
319+
188320
@task(name="train-student-task", log_prints=True)
189321
def train_student_task(
190322
labeled_path: str,
@@ -354,8 +486,8 @@ def main() -> None:
354486
parser = argparse.ArgumentParser(description="Prefect flows for summary KD pipeline (distill_by_prefect.md)")
355487
parser.add_argument(
356488
"flow",
357-
choices=["build_dataset", "labeling", "train_student", "evaluate", "all"],
358-
help="Flow to run",
489+
choices=["build_dataset", "labeling", "labeling_with_pod", "train_student", "evaluate", "all"],
490+
help="Flow to run (labeling_with_pod: Pod 생성→라벨링→삭제)",
359491
)
360492
parser.add_argument("--input", type=Path, default=None, help="Input reviews JSON (default: tasteam_app_all_review_data.json)")
361493
parser.add_argument("--out-dir", type=Path, default=None, help="Output root (default: distill_pipeline_output)")
@@ -404,6 +536,35 @@ def main() -> None:
404536
output_labeled_dir=out_dir,
405537
)
406538
print("Result:", result)
539+
elif args.flow == "labeling_with_pod":
540+
if not args.train_path:
541+
parser.error("labeling_with_pod requires --train-path")
542+
ds_dir = out_dir / "datasets"
543+
val_p, test_p = None, None
544+
if args.val_path:
545+
val_p = str(args.val_path)
546+
elif ds_dir.exists():
547+
for d in sorted(ds_dir.iterdir(), reverse=True):
548+
v = d / "val.json"
549+
if v.exists():
550+
val_p = str(v)
551+
break
552+
if args.test_path:
553+
test_p = str(args.test_path)
554+
elif ds_dir.exists():
555+
for d in sorted(ds_dir.iterdir(), reverse=True):
556+
t = d / "test.json"
557+
if t.exists():
558+
test_p = str(t)
559+
break
560+
result = labeling_with_pod_flow(
561+
train_path=str(args.train_path),
562+
val_path=val_p,
563+
test_path=test_p,
564+
openai_cap=args.openai_cap,
565+
output_labeled_dir=out_dir,
566+
)
567+
print("Result:", result)
407568
elif args.flow == "train_student":
408569
if not args.labeled_path:
409570
parser.error("train_student requires --labeled-path")

scripts/runpod_cli/pod_create_delete_cli.py

Lines changed: 42 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,46 @@ def wait_until_running(
6666

6767
raise TimeoutError(f"Pod {pod_id} did not reach RUNNING within {timeout_sec}s. Last: {last}")
6868

69+
@staticmethod
70+
def get_default_pod_payload() -> Dict[str, Any]:
71+
"""vLLM Pod 생성용 기본 payload (distill 라벨링 등에서 재사용)."""
72+
return {
73+
"allowedCudaVersions": ["13.0"],
74+
"cloudType": "SECURE",
75+
"computeType": "GPU",
76+
"containerDiskInGb": 50,
77+
"cpuFlavorPriority": "availability",
78+
"dataCenterIds": [
79+
"EU-RO-1", "CA-MTL-1", "EU-SE-1", "US-IL-1", "EUR-IS-1", "EU-CZ-1", "US-TX-3", "EUR-IS-2",
80+
"US-KS-2", "US-GA-2", "US-WA-1", "US-TX-1", "CA-MTL-3", "EU-NL-1", "US-TX-4", "US-CA-2",
81+
"US-NC-1", "OC-AU-1", "US-DE-1", "EUR-IS-3", "CA-MTL-2", "AP-JP-1", "EUR-NO-1", "EU-FR-1",
82+
"US-KS-3", "US-GA-1",
83+
],
84+
"dataCenterPriority": "availability",
85+
"dockerEntrypoint": [],
86+
"dockerStartCmd": [],
87+
"env": {"ENV_VAR": "value"},
88+
"globalNetworking": False,
89+
"gpuCount": 1,
90+
"gpuTypeIds": ["NVIDIA GeForce RTX 4090"],
91+
"gpuTypePriority": "availability",
92+
"imageName": "jinsoo1218/vllm-pod:latest",
93+
"interruptible": False,
94+
"locked": False,
95+
"minDiskBandwidthMBps": 123,
96+
"minDownloadMbps": 123,
97+
"minRAMPerGPU": 8,
98+
"minUploadMbps": 123,
99+
"minVCPUPerGPU": 2,
100+
"name": "vllm-pod",
101+
"networkVolumeId": "2kn4qj6rql",
102+
"ports": ["8000/http", "22/tcp"],
103+
"supportPublicIp": True,
104+
"vcpuCount": 2,
105+
"volumeInGb": 20,
106+
"volumeMountPath": "/workspace",
107+
}
108+
69109
def _handle_json_response(self, resp: requests.Response) -> Dict[str, Any]:
70110
# 에러 메시지를 보기 좋게
71111
try:
@@ -82,45 +122,9 @@ def _handle_json_response(self, resp: requests.Response) -> Dict[str, Any]:
82122

83123

84124
if __name__ == "__main__":
85-
token = os.environ["RUNPOD_API_KEY"] # 너가 쓰는 키 이름으로 통일
125+
token = os.environ["RUNPOD_API_KEY"]
86126
client = RunPodClient(token)
87-
88-
payload = {
89-
"allowedCudaVersions": ["13.0"],
90-
"cloudType": "SECURE",
91-
"computeType": "GPU",
92-
"containerDiskInGb": 50,
93-
"cpuFlavorPriority": "availability",
94-
"dataCenterIds": [
95-
"EU-RO-1","CA-MTL-1","EU-SE-1","US-IL-1","EUR-IS-1","EU-CZ-1","US-TX-3","EUR-IS-2",
96-
"US-KS-2","US-GA-2","US-WA-1","US-TX-1","CA-MTL-3","EU-NL-1","US-TX-4","US-CA-2",
97-
"US-NC-1","OC-AU-1","US-DE-1","EUR-IS-3","CA-MTL-2","AP-JP-1","EUR-NO-1","EU-FR-1",
98-
"US-KS-3","US-GA-1"
99-
],
100-
"dataCenterPriority": "availability",
101-
"dockerEntrypoint": [],
102-
"dockerStartCmd": [],
103-
"env": {"ENV_VAR": "value"},
104-
"globalNetworking": False,
105-
"gpuCount": 1,
106-
"gpuTypeIds": ["NVIDIA GeForce RTX 4090"],
107-
"gpuTypePriority": "availability",
108-
"imageName": "jinsoo1218/vllm-pod:latest",
109-
"interruptible": False,
110-
"locked": False,
111-
"minDiskBandwidthMBps": 123,
112-
"minDownloadMbps": 123,
113-
"minRAMPerGPU": 8,
114-
"minUploadMbps": 123,
115-
"minVCPUPerGPU": 2,
116-
"name": "vllm-pod",
117-
"networkVolumeId": "2kn4qj6rql",
118-
"ports": ["8000/http", "22/tcp"],
119-
"supportPublicIp": True,
120-
"vcpuCount": 2,
121-
"volumeInGb": 20,
122-
"volumeMountPath": "/workspace",
123-
}
127+
payload = RunPodClient.get_default_pod_payload()
124128

125129
pod = client.create_pod(payload)
126130
pod_id = pod["id"]

0 commit comments

Comments
 (0)