Skip to content

Commit 2441d6c

Browse files
yossiovadia and Claude authored
Convert step 08 to python (llm-d#272)
* Convert step 08 to python Fixes llm-d#268 - Convert 08_deploy_gaie.sh to Python implementation - Add extract_environment and get_image functions to setup/functions.py - Maintain equivalent functionality for GAIE deployment - Follow BASH_TO_PYTHON_CONVERSION.md guidelines * Fix TypeError in llmdbench_execute_cmd call Changed exit_on_failure to fatal parameter to match the correct function signature. Multiple model processing works correctly - when running with models "meta-llama/Llama-3.2-1B-Instruct,meta-llama/Llama-3.2-3B-Instruct" both models are processed, creating separate helmfile-00.yaml and helmfile-01.yaml files with distinct model labels. * Implement proper auto tag resolution in Python get_image function - Replace fallback to 'latest' with actual auto tag resolution using skopeo/podman - Match bash implementation exactly: use LLMDBENCH_CONTROL_CCMD to choose tool - Fix multi-model processing issue where script would exit after first model - Add debug output for multi-model processing progress Addresses PR feedback about auto tag resolution breaking existing workflows. Co-Authored-By: Claude <noreply@anthropic.com> --------- Co-authored-by: Claude <noreply@anthropic.com>
1 parent e11368e commit 2441d6c

File tree

2 files changed

+244
-0
lines changed

2 files changed

+244
-0
lines changed

setup/functions.py

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -484,3 +484,102 @@ def model_attribute(model: str, attribute: str) -> str:
484484
return result.lower()
485485
else:
486486
return result
487+
488+
489+
def extract_environment():
    """
    Extract and display environment variables for debugging.

    Equivalent to the bash extract_environment function.

    Collects every environment variable whose name starts with
    ``LLMDBENCH_`` — skipping any whose name contains TOKEN, USER,
    PASSWORD, or EMAIL, since those may hold credentials — prints the
    sorted list once per process, and persists it to
    ``<LLMDBENCH_CONTROL_WORK_DIR>/environment/variables``.
    """
    # Get environment variables that start with LLMDBENCH, excluding
    # sensitive ones.  (The previous version also built an `ev` dict here
    # that was never read; that dead code has been removed.)
    env_vars = sorted(
        f"{key}={value}"
        for key, value in os.environ.items()
        if key.startswith("LLMDBENCH_")
        and not any(
            sensitive in key.upper()
            for sensitive in ("TOKEN", "USER", "PASSWORD", "EMAIL")
        )
    )

    # Display the list only once: the sentinel is stored back into the
    # environment so child processes inherit the "already displayed" flag.
    envvar_displayed = int(os.environ.get("LLMDBENCH_CONTROL_ENVVAR_DISPLAYED", 0))
    if envvar_displayed == 0:
        print("\n\nList of environment variables which will be used")
        for var in env_vars:
            print(var)
        print("\n\n")
        os.environ["LLMDBENCH_CONTROL_ENVVAR_DISPLAYED"] = "1"

    # Write environment variables to file for later inspection.
    work_dir = os.environ.get("LLMDBENCH_CONTROL_WORK_DIR", ".")
    env_dir = Path(work_dir) / "environment"
    env_dir.mkdir(parents=True, exist_ok=True)

    with open(env_dir / "variables", "w") as f:
        for var in env_vars:
            f.write(var + "\n")
526+
527+
528+
def get_image(image_registry: str, image_repo: str, image_name: str, image_tag: str, tag_only: str = "0") -> str:
    """
    Construct container image reference.

    Equivalent to the bash get_image function.

    When ``image_tag`` is the literal string ``"auto"``, the latest tag is
    resolved with the container CLI named by ``LLMDBENCH_CONTROL_CCMD``
    (``podman`` or, by default, ``skopeo``); if no tag can be resolved the
    process exits with status 1.

    Args:
        image_registry: Container registry
        image_repo: Repository/organization
        image_name: Image name
        image_tag: Image tag, or "auto" to resolve the latest tag
        tag_only: If "1", return only the tag

    Returns:
        Full image reference or just tag
    """
    import json  # local import: only needed on the skopeo auto-resolution path

    resolved_tag = image_tag

    if image_tag == "auto":
        ccmd = os.getenv("LLMDBENCH_CONTROL_CCMD", "skopeo")
        image_full_name = f"{image_registry}/{image_repo}/{image_name}"

        # Bug fix: start from an empty tag so that a failed resolution falls
        # through to the fatal "unable to find latest tag" guard below.
        # Previously the literal "auto" (truthy) survived a failure and the
        # function silently returned an image reference tagged ":auto".
        resolved_tag = ""

        if ccmd == "podman":
            # `podman search --list-tags` prints one "<image> <tag>" row per
            # tag; take the tag column of the last row.  Failures fall through
            # to the guard below (the bash version used `|| true`).
            try:
                result = subprocess.run(
                    [ccmd, "search", "--list-tags", image_full_name],
                    capture_output=True, text=True, check=False,
                )
                if result.returncode == 0:
                    lines = result.stdout.strip().split('\n')
                    if lines:
                        parts = lines[-1].split()
                        if len(parts) >= 2:
                            resolved_tag = parts[1]
            except OSError:
                # e.g. podman binary not installed
                pass
        else:
            # `skopeo list-tags` emits JSON of the form {"Tags": [...]};
            # take the last tag (equivalent of `jq -r .Tags[] | tail -1`).
            try:
                result = subprocess.run(
                    ["skopeo", "list-tags", f"docker://{image_full_name}"],
                    capture_output=True, text=True, check=True,
                )
                tags = json.loads(result.stdout).get("Tags")
                if tags:
                    resolved_tag = tags[-1]
            except (OSError, subprocess.CalledProcessError, json.JSONDecodeError):
                resolved_tag = ""

        if not resolved_tag:
            announce(f"❌ Unable to find latest tag for image \"{image_full_name}\"")
            sys.exit(1)

    if tag_only == "1":
        return resolved_tag
    else:
        return f"{image_registry}/{image_repo}/{image_name}:{resolved_tag}"

setup/steps/08_deploy_gaie.py

Lines changed: 145 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,145 @@
1+
#!/usr/bin/env python3
2+
3+
import os
4+
import sys
5+
from pathlib import Path
6+
7+
# Add project root to Python path
8+
current_file = Path(__file__).resolve()
9+
project_root = current_file.parents[1]
10+
sys.path.insert(0, str(project_root))
11+
12+
# Import from functions.py
13+
from functions import announce, llmdbench_execute_cmd, model_attribute, extract_environment, get_image
14+
15+
16+
def main():
    """Deploy GAIE (Gateway API Inference Extension) components.

    For each model in LLMDBENCH_DEPLOY_MODEL_LIST, renders a gaie-values.yaml
    under the work directory and applies the per-model helmfile
    (helmfile-NN.yaml — presumably generated by an earlier step; TODO confirm).
    Returns 0 on completion (also when the step is skipped).
    """
    os.environ["CURRENT_STEP_NAME"] = os.path.splitext(os.path.basename(__file__))[0]

    # Parse environment variables: every LLMDBENCH_* var becomes a
    # lower-cased key in `ev` (e.g. LLMDBENCH_CONTROL_WORK_DIR -> control_work_dir).
    ev = {}
    for key in dict(os.environ).keys():
        if "LLMDBENCH_" in key:
            ev.update({key.split("LLMDBENCH_")[1].lower(): os.environ.get(key)})

    # Check if modelservice environment is active; otherwise skip the step.
    if int(ev.get("control_environment_type_modelservice_active", 0)) == 1:
        extract_environment()

        model_number = 0
        # Model list may be comma- or space-separated.
        model_list = ev.get("deploy_model_list", "").replace(",", " ").split()

        for model in model_list:
            announce(f"🔄 Processing model {model_number + 1}/{len(model_list)}: {model}")

            # Get model attribute; exported so downstream tooling (helmfile
            # templates) can read the current model's label.
            model_id_label = model_attribute(model, "modelid_label")
            os.environ["LLMDBENCH_DEPLOY_CURRENT_MODEL_ID_LABEL"] = model_id_label

            # Format model number with zero padding (00, 01, ...) to match
            # the helmfile-NN.yaml naming used below.
            model_num = f"{model_number:02d}"

            # Create directory structure for this model's helm values.
            helm_dir = Path(ev["control_work_dir"]) / "setup" / "helm" / ev["vllm_modelservice_release"] / model_num
            helm_dir.mkdir(parents=True, exist_ok=True)

            # Read GAIE presets file content; a missing file is non-fatal and
            # yields an empty custom-config body.
            presets_path = Path(ev["vllm_modelservice_gaie_presets_full_path"])
            try:
                with open(presets_path, 'r') as f:
                    presets_content = f.read()
                # Indent each line with 6 spaces for YAML formatting, so the
                # presets nest under the `<name>: |` block scalar below.
                indented_presets = '\n'.join(f"      {line}" for line in presets_content.splitlines())
            except FileNotFoundError:
                announce(f"⚠️ Warning: GAIE presets file not found at {presets_path}")
                indented_presets = ""

            # Get image tag ("1" -> tag only; "auto" tags are resolved by get_image).
            image_tag = get_image(
                ev["llmd_inferencescheduler_image_registry"],
                ev["llmd_inferencescheduler_image_repo"],
                ev["llmd_inferencescheduler_image_name"],
                ev["llmd_inferencescheduler_image_tag"],
                "1"
            )

            # Generate GAIE values YAML content.
            gaie_values_content = f"""inferenceExtension:
  replicas: 1
  image:
    name: {ev['llmd_inferencescheduler_image_name']}
    hub: {ev['llmd_inferencescheduler_image_registry']}/{ev['llmd_inferencescheduler_image_repo']}
    tag: {image_tag}
    pullPolicy: Always
  extProcPort: 9002
  pluginsConfigFile: "{ev['vllm_modelservice_gaie_presets']}"

  # using upstream GIE default-plugins, see: https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/config/charts/inferencepool/templates/epp-config.yaml#L7C3-L56C33
  pluginsCustomConfig:
    {ev['vllm_modelservice_gaie_presets']}: |
{indented_presets}
inferencePool:
  targetPortNumber: {ev['vllm_common_inference_port']}
  modelServerType: vllm
  modelServers:
    matchLabels:
      llm-d.ai/inferenceServing: "true"
      llm-d.ai/model: {model_id_label}
"""

            # Write GAIE values file.
            gaie_values_file = helm_dir / "gaie-values.yaml"
            with open(gaie_values_file, 'w') as f:
                f.write(gaie_values_content)

            # Deploy helm chart via helmfile, selecting only this model's
            # gaie release by name.
            announce(f"🚀 Installing helm chart \"gaie-{ev['vllm_modelservice_release']}\" via helmfile...")
            helmfile_cmd = (
                f"helmfile --namespace {ev['vllm_common_namespace']} "
                f"--kubeconfig {ev['control_work_dir']}/environment/context.ctx "
                f"--selector name={ev['vllm_common_namespace']}-{model_id_label}-gaie "
                f"apply -f {ev['control_work_dir']}/setup/helm/{ev['vllm_modelservice_release']}/helmfile-{model_num}.yaml "
                f"--skip-diff-on-install"
            )

            llmdbench_execute_cmd(
                actual_cmd=helmfile_cmd,
                dry_run=int(ev.get("control_dry_run", 0)),
                verbose=int(ev.get("control_verbose", 0))
            )

            announce(f"✅ {ev['vllm_common_namespace']}-{model_id_label}-gaie helm chart deployed successfully")

            # List relevant resources; routes exist only on OpenShift.
            resource_list = "deployment,service,pods,secrets,inferencepools"
            if int(ev.get("control_deploy_is_openshift", 0)) == 1:
                resource_list += ",route"

            announce(f"ℹ️ A snapshot of the relevant (model-specific) resources on namespace \"{ev['vllm_common_namespace']}\":")

            # Snapshot is best-effort (fatal=False) and skipped on dry runs.
            if int(ev.get("control_dry_run", 0)) == 0:
                kubectl_cmd = f"{ev['control_kcmd']} get --namespace {ev['vllm_common_namespace']} {resource_list}"
                llmdbench_execute_cmd(
                    actual_cmd=kubectl_cmd,
                    dry_run=int(ev.get("control_dry_run", 0)),
                    verbose=int(ev.get("control_verbose", 0)),
                    fatal=False
                )

            # Clean up the per-model environment variable before the next iteration.
            if "LLMDBENCH_DEPLOY_CURRENT_MODEL_ID_LABEL" in os.environ:
                del os.environ["LLMDBENCH_DEPLOY_CURRENT_MODEL_ID_LABEL"]

            model_number += 1

        announce("✅ Completed model deployment")
    else:
        deploy_methods = ev.get("deploy_methods", "")
        announce(f"⏭️ Environment types are \"{deploy_methods}\". Skipping this step.")

    return 0
142+
143+
144+
# Script entry point: propagate main()'s return value as the process exit code.
if __name__ == "__main__":
    sys.exit(main())

0 commit comments

Comments
 (0)