|
17 | 17 |
|
18 | 18 | 실행: |
19 | 19 | python scripts/distill_flows.py build_dataset [--input path] [--out-dir dir] |
| 20 | + python scripts/distill_flows.py labeling --train-path datasets/xxx/train.json |
| 21 | + python scripts/distill_flows.py labeling_with_pod --train-path datasets/xxx/train.json # Pod 생성→라벨링→삭제 |
20 | 22 | python scripts/distill_flows.py all |
21 | 23 | """ |
22 | 24 |
|
|
33 | 35 |
|
34 | 36 | import argparse |
35 | 37 | import json |
| 38 | +import logging |
| 39 | +import os |
36 | 40 | import subprocess |
37 | 41 | import sys |
| 42 | +import time |
38 | 43 | from datetime import datetime |
39 | 44 | from pathlib import Path |
40 | 45 |
|
|
48 | 53 | _SCRIPT_DIR = Path(__file__).resolve().parent |
49 | 54 | _PROJECT_ROOT = _SCRIPT_DIR.parent |
50 | 55 |
|
# Created before the optional import below so the import failure can be logged.
logger = logging.getLogger(__name__)

# RunPodClient is imported from scripts/runpod_cli; put the scripts directory
# on sys.path so the package resolves when this file is executed directly.
if str(_SCRIPT_DIR) not in sys.path:
    sys.path.insert(0, str(_SCRIPT_DIR))
try:
    from runpod_cli.pod_create_delete_cli import RunPodClient
except ImportError as exc:
    # Keep a None sentinel instead of failing at import time;
    # labeling_with_pod_task checks for it and raises a RuntimeError.
    # Record the cause so that later error is diagnosable.
    logger.debug("RunPodClient import failed: %s", exc)
    RunPodClient = None
| 65 | + |
51 | 66 |
|
52 | 67 | @task(name="build-dataset-task", log_prints=True) |
53 | 68 | def build_dataset_task( |
@@ -185,6 +200,123 @@ def labeling_flow( |
185 | 200 | ) |
186 | 201 |
|
187 | 202 |
|
| 203 | +def _wait_for_vllm_ready(base_url: str, timeout_sec: int = 180, poll_interval: int = 10) -> None: |
| 204 | + """vLLM /v1/models 가 응답할 때까지 대기.""" |
| 205 | + import requests |
| 206 | + url = base_url.rstrip("/") + "/v1/models" |
| 207 | + deadline = time.time() + timeout_sec |
| 208 | + while time.time() < deadline: |
| 209 | + try: |
| 210 | + r = requests.get(url, timeout=10) |
| 211 | + if r.status_code == 200: |
| 212 | + logger.info("vLLM ready: %s", url) |
| 213 | + return |
| 214 | + except Exception as e: |
| 215 | + logger.debug("vLLM not ready yet: %s", e) |
| 216 | + time.sleep(poll_interval) |
| 217 | + raise TimeoutError(f"vLLM at {base_url} did not become ready within {timeout_sec}s") |
| 218 | + |
| 219 | + |
@task(name="labeling-with-pod-task", log_prints=True)
def labeling_with_pod_task(
    train_path: str,
    val_path: str | None = None,
    test_path: str | None = None,
    openai_cap: int = 500,
    output_labeled_dir: str | None = None,
    pod_wait_timeout_sec: int = 600,
    vllm_ready_timeout_sec: int = 180,
) -> dict:
    """Create a pod, wait for vLLM, run labeling, and always delete the pod.

    Requires the ``RUNPOD_API_KEY`` environment variable. Once the pod is
    running and vLLM answers, ``label_for_distill.py`` is run as a subprocess
    with ``VLLM_POD_BASE_URL``, ``USE_POD_VLLM`` and ``LLM_PROVIDER`` set in
    its environment.

    Args:
        train_path: Training split passed as ``--train-path``.
        val_path: Optional validation split; forwarded only if the path exists.
        test_path: Optional test split; forwarded only if the path exists.
        openai_cap: Value passed as ``--openai-cap``.
        output_labeled_dir: Output root; defaults to
            ``<project root>/distill_pipeline_output``. Results are written
            to ``<root>/labeled/<UTC timestamp>``.
        pod_wait_timeout_sec: Timeout handed to ``wait_until_running``.
        vllm_ready_timeout_sec: Timeout for the vLLM readiness probe.

    Returns:
        A dict with ``labeled_version`` and ``labeled_path``, plus
        ``val_labeled_path`` / ``test_labeled_path`` when those files exist.
        ``labeled_path`` is returned without checking that the file exists.

    Raises:
        RuntimeError: RunPodClient could not be imported, the pod reported no
            ``publicIp``, or the labeling script exited non-zero.
        ValueError: ``RUNPOD_API_KEY`` is not set.
        TimeoutError: vLLM did not become ready in time.
    """
    from datetime import timezone

    if RunPodClient is None:
        raise RuntimeError("RunPodClient not available. Check runpod_cli import.")
    token = os.environ.get("RUNPOD_API_KEY")
    if not token:
        raise ValueError("RUNPOD_API_KEY environment variable is required for labeling_with_pod")

    client = RunPodClient(token=token)
    payload = RunPodClient.get_default_pod_payload()
    pod = client.create_pod(payload)
    pod_id = pod["id"]
    print("Pod created:", pod_id)

    # From here on the pod exists, so every exit path must go through the
    # cleanup in ``finally``.
    try:
        ready = client.wait_until_running(pod_id, timeout_sec=pod_wait_timeout_sec)
        public_ip = ready.get("publicIp")
        if not public_ip:
            raise RuntimeError(f"Pod {pod_id} has no publicIp. Response: {ready}")

        server_root = f"http://{public_ip}:8000"
        base_url = f"{server_root}/v1"
        print("Pod ready:", pod_id, "base_url:", base_url)

        # Probe with the server root: the readiness check appends
        # "/v1/models" itself, so passing base_url would poll /v1/v1/models.
        _wait_for_vllm_ready(server_root, timeout_sec=vllm_ready_timeout_sec)

        env = os.environ.copy()
        env["VLLM_POD_BASE_URL"] = base_url
        env["USE_POD_VLLM"] = "true"
        env["LLM_PROVIDER"] = "runpod"

        out_dir = Path(output_labeled_dir or _PROJECT_ROOT / "distill_pipeline_output")
        # Aware UTC timestamp; same format as before, without the deprecated utcnow().
        version = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
        labeled_dir = out_dir / "labeled" / version
        labeled_dir.mkdir(parents=True, exist_ok=True)

        cmd = [
            sys.executable,
            str(_SCRIPT_DIR / "label_for_distill.py"),
            "--train-path", str(train_path),
            "--openai-cap", str(openai_cap),
            "--output-dir", str(labeled_dir),
        ]
        for flag, split_path in (("--val-path", val_path), ("--test-path", test_path)):
            if not split_path:
                continue
            if Path(split_path).exists():
                cmd.extend([flag, str(split_path)])
            else:
                # Still skipped as before, but no longer silently.
                print(f"Skipping {flag}: {split_path} does not exist")

        # capture_output=False: the child's stdout/stderr are not captured.
        result = subprocess.run(cmd, cwd=str(_PROJECT_ROOT), env=env, capture_output=False)
        if result.returncode != 0:
            raise RuntimeError(f"label_for_distill.py exited with {result.returncode}")

        labeled_path = labeled_dir / "train_labeled.json"
        out = {"labeled_version": version, "labeled_path": str(labeled_path)}
        for name, fn in [("val_labeled_path", "val_labeled.json"), ("test_labeled_path", "test_labeled.json")]:
            p = labeled_dir / fn
            if p.exists():
                out[name] = str(p)
        return out
    finally:
        print("Cleaning up pod:", pod_id)
        try:
            client.delete_pod(pod_id)
        except Exception:
            # Surface the pod id so a leftover pod can be removed by hand.
            logger.exception("Failed to delete pod %s; it may still exist and need manual removal", pod_id)
            raise
| 293 | + |
| 294 | + |
@flow(name="labeling_with_pod_flow", log_prints=True)
def labeling_with_pod_flow(
    train_path: str,
    val_path: str | None = None,
    test_path: str | None = None,
    openai_cap: int = 500,
    output_labeled_dir: str | Path | None = None,
    pod_wait_timeout_sec: int = 600,
    vllm_ready_timeout_sec: int = 180,
) -> dict:
    """Flow wrapper: create pod -> label (OpenAI gold + vLLM teacher) -> delete pod.

    Follows docs/runpod_cli/cli_strategy.md: "create the pod -> wait -> set
    VLLM_POD_BASE_URL and run label_for_distill -> delete the pod once the
    work is done".

    All arguments are forwarded unchanged to ``labeling_with_pod_task``,
    except ``output_labeled_dir``, which is converted to ``str`` (or ``None``
    when falsy). Returns whatever the task returns.
    """
    # The task takes the output directory as ``str | None``.
    labeled_dir_arg = None
    if output_labeled_dir:
        labeled_dir_arg = str(output_labeled_dir)

    task_kwargs = {
        "train_path": train_path,
        "val_path": val_path,
        "test_path": test_path,
        "openai_cap": openai_cap,
        "output_labeled_dir": labeled_dir_arg,
        "pod_wait_timeout_sec": pod_wait_timeout_sec,
        "vllm_ready_timeout_sec": vllm_ready_timeout_sec,
    }
    return labeling_with_pod_task(**task_kwargs)
| 318 | + |
| 319 | + |
188 | 320 | @task(name="train-student-task", log_prints=True) |
189 | 321 | def train_student_task( |
190 | 322 | labeled_path: str, |
@@ -354,8 +486,8 @@ def main() -> None: |
354 | 486 | parser = argparse.ArgumentParser(description="Prefect flows for summary KD pipeline (distill_by_prefect.md)") |
355 | 487 | parser.add_argument( |
356 | 488 | "flow", |
357 | | - choices=["build_dataset", "labeling", "train_student", "evaluate", "all"], |
358 | | - help="Flow to run", |
| 489 | + choices=["build_dataset", "labeling", "labeling_with_pod", "train_student", "evaluate", "all"], |
| 490 | + help="Flow to run (labeling_with_pod: Pod 생성→라벨링→삭제)", |
359 | 491 | ) |
360 | 492 | parser.add_argument("--input", type=Path, default=None, help="Input reviews JSON (default: tasteam_app_all_review_data.json)") |
361 | 493 | parser.add_argument("--out-dir", type=Path, default=None, help="Output root (default: distill_pipeline_output)") |
@@ -404,6 +536,35 @@ def main() -> None: |
404 | 536 | output_labeled_dir=out_dir, |
405 | 537 | ) |
406 | 538 | print("Result:", result) |
| 539 | + elif args.flow == "labeling_with_pod": |
| 540 | + if not args.train_path: |
| 541 | + parser.error("labeling_with_pod requires --train-path") |
| 542 | + ds_dir = out_dir / "datasets" |
| 543 | + val_p, test_p = None, None |
| 544 | + if args.val_path: |
| 545 | + val_p = str(args.val_path) |
| 546 | + elif ds_dir.exists(): |
| 547 | + for d in sorted(ds_dir.iterdir(), reverse=True): |
| 548 | + v = d / "val.json" |
| 549 | + if v.exists(): |
| 550 | + val_p = str(v) |
| 551 | + break |
| 552 | + if args.test_path: |
| 553 | + test_p = str(args.test_path) |
| 554 | + elif ds_dir.exists(): |
| 555 | + for d in sorted(ds_dir.iterdir(), reverse=True): |
| 556 | + t = d / "test.json" |
| 557 | + if t.exists(): |
| 558 | + test_p = str(t) |
| 559 | + break |
| 560 | + result = labeling_with_pod_flow( |
| 561 | + train_path=str(args.train_path), |
| 562 | + val_path=val_p, |
| 563 | + test_path=test_p, |
| 564 | + openai_cap=args.openai_cap, |
| 565 | + output_labeled_dir=out_dir, |
| 566 | + ) |
| 567 | + print("Result:", result) |
407 | 568 | elif args.flow == "train_student": |
408 | 569 | if not args.labeled_path: |
409 | 570 | parser.error("train_student requires --labeled-path") |
|
0 commit comments