Skip to content

Commit 2441d6c

Browse files
yossiovadia and Claude authored
Convert step 08 to python (llm-d#272)
* Convert step 08 to python Fixes llm-d#268 - Convert 08_deploy_gaie.sh to Python implementation - Add extract_environment and get_image functions to setup/functions.py - Maintain equivalent functionality for GAIE deployment - Follow BASH_TO_PYTHON_CONVERSION.md guidelines * Fix TypeError in llmdbench_execute_cmd call Changed exit_on_failure to fatal parameter to match the correct function signature. Multiple model processing works correctly - when running with models "meta-llama/Llama-3.2-1B-Instruct,meta-llama/Llama-3.2-3B-Instruct" both models are processed, creating separate helmfile-00.yaml and helmfile-01.yaml files with distinct model labels. * Implement proper auto tag resolution in Python get_image function - Replace fallback to 'latest' with actual auto tag resolution using skopeo/podman - Match bash implementation exactly: use LLMDBENCH_CONTROL_CCMD to choose tool - Fix multi-model processing issue where script would exit after first model - Add debug output for multi-model processing progress Addresses PR feedback about auto tag resolution breaking existing workflows. Co-Authored-By: Claude <noreply@anthropic.com> --------- Co-authored-by: Claude <noreply@anthropic.com>
1 parent e11368e commit 2441d6c

File tree

2 files changed

+244
-0
lines changed

2 files changed

+244
-0
lines changed

setup/functions.py

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -484,3 +484,102 @@ def model_attribute(model: str, attribute: str) -> str:
484484
return result.lower()
485485
else:
486486
return result
487+
488+
489+
def extract_environment():
    """
    Extract and display environment variables for debugging.

    Equivalent to the bash extract_environment function.

    Collects every environment variable whose name starts with
    ``LLMDBENCH_`` — skipping any whose name contains TOKEN, USER,
    PASSWORD, or EMAIL, since those may hold credentials — prints the
    sorted list once per process, and persists it to
    ``<LLMDBENCH_CONTROL_WORK_DIR>/environment/variables``.
    """
    # Get environment variables that start with LLMDBENCH, excluding
    # sensitive ones.  (The previous version also built an `ev` dict here
    # that was never read; that dead code has been removed.)
    env_vars = sorted(
        f"{key}={value}"
        for key, value in os.environ.items()
        if key.startswith("LLMDBENCH_")
        and not any(
            sensitive in key.upper()
            for sensitive in ("TOKEN", "USER", "PASSWORD", "EMAIL")
        )
    )

    # Display the list only once: the sentinel is stored back into the
    # environment so child processes inherit the "already displayed" flag.
    envvar_displayed = int(os.environ.get("LLMDBENCH_CONTROL_ENVVAR_DISPLAYED", 0))
    if envvar_displayed == 0:
        print("\n\nList of environment variables which will be used")
        for var in env_vars:
            print(var)
        print("\n\n")
        os.environ["LLMDBENCH_CONTROL_ENVVAR_DISPLAYED"] = "1"

    # Write environment variables to file for later inspection.
    work_dir = os.environ.get("LLMDBENCH_CONTROL_WORK_DIR", ".")
    env_dir = Path(work_dir) / "environment"
    env_dir.mkdir(parents=True, exist_ok=True)

    with open(env_dir / "variables", "w") as f:
        for var in env_vars:
            f.write(var + "\n")
526+
527+
528+
def get_image(image_registry: str, image_repo: str, image_name: str, image_tag: str, tag_only: str = "0") -> str:
    """
    Construct container image reference.

    Equivalent to the bash get_image function.

    When ``image_tag`` is the literal string ``"auto"``, the latest tag is
    resolved with the container CLI named by ``LLMDBENCH_CONTROL_CCMD``
    (``podman`` or, by default, ``skopeo``); if no tag can be resolved the
    process exits with status 1.

    Args:
        image_registry: Container registry
        image_repo: Repository/organization
        image_name: Image name
        image_tag: Image tag, or "auto" to resolve the latest tag
        tag_only: If "1", return only the tag

    Returns:
        Full image reference or just tag
    """
    import json  # local import: only needed on the skopeo auto-resolution path

    resolved_tag = image_tag

    if image_tag == "auto":
        ccmd = os.getenv("LLMDBENCH_CONTROL_CCMD", "skopeo")
        image_full_name = f"{image_registry}/{image_repo}/{image_name}"

        # Bug fix: start from an empty tag so that a failed resolution falls
        # through to the fatal "unable to find latest tag" guard below.
        # Previously the literal "auto" (truthy) survived a failure and the
        # function silently returned an image reference tagged ":auto".
        resolved_tag = ""

        if ccmd == "podman":
            # `podman search --list-tags` prints one "<image> <tag>" row per
            # tag; take the tag column of the last row.  Failures fall through
            # to the guard below (the bash version used `|| true`).
            try:
                result = subprocess.run(
                    [ccmd, "search", "--list-tags", image_full_name],
                    capture_output=True, text=True, check=False,
                )
                if result.returncode == 0:
                    lines = result.stdout.strip().split('\n')
                    if lines:
                        parts = lines[-1].split()
                        if len(parts) >= 2:
                            resolved_tag = parts[1]
            except OSError:
                # e.g. podman binary not installed
                pass
        else:
            # `skopeo list-tags` emits JSON of the form {"Tags": [...]};
            # take the last tag (equivalent of `jq -r .Tags[] | tail -1`).
            try:
                result = subprocess.run(
                    ["skopeo", "list-tags", f"docker://{image_full_name}"],
                    capture_output=True, text=True, check=True,
                )
                tags = json.loads(result.stdout).get("Tags")
                if tags:
                    resolved_tag = tags[-1]
            except (OSError, subprocess.CalledProcessError, json.JSONDecodeError):
                resolved_tag = ""

        if not resolved_tag:
            announce(f"❌ Unable to find latest tag for image \"{image_full_name}\"")
            sys.exit(1)

    if tag_only == "1":
        return resolved_tag
    else:
        return f"{image_registry}/{image_repo}/{image_name}:{resolved_tag}"

setup/steps/08_deploy_gaie.py

Lines changed: 145 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,145 @@
1+
#!/usr/bin/env python3
2+
3+
import os
4+
import sys
5+
from pathlib import Path
6+
7+
# Add project root to Python path
8+
current_file = Path(__file__).resolve()
9+
project_root = current_file.parents[1]
10+
sys.path.insert(0, str(project_root))
11+
12+
# Import from functions.py
13+
from functions import announce, llmdbench_execute_cmd, model_attribute, extract_environment, get_image
14+
15+
16+
def main():
    """Deploy GAIE (Gateway API Inference Extension) components.

    For each model in LLMDBENCH_DEPLOY_MODEL_LIST, renders a gaie-values.yaml
    under the work directory and applies the per-model helmfile
    (helmfile-NN.yaml — presumably generated by an earlier step; TODO confirm).
    Returns 0 on completion (also when the step is skipped).
    """
    os.environ["CURRENT_STEP_NAME"] = os.path.splitext(os.path.basename(__file__))[0]

    # Parse environment variables: every LLMDBENCH_* var becomes a
    # lower-cased key in `ev` (e.g. LLMDBENCH_CONTROL_WORK_DIR -> control_work_dir).
    ev = {}
    for key in dict(os.environ).keys():
        if "LLMDBENCH_" in key:
            ev.update({key.split("LLMDBENCH_")[1].lower(): os.environ.get(key)})

    # Check if modelservice environment is active; otherwise skip the step.
    if int(ev.get("control_environment_type_modelservice_active", 0)) == 1:
        extract_environment()

        model_number = 0
        # Model list may be comma- or space-separated.
        model_list = ev.get("deploy_model_list", "").replace(",", " ").split()

        for model in model_list:
            announce(f"🔄 Processing model {model_number + 1}/{len(model_list)}: {model}")

            # Get model attribute; exported so downstream tooling (helmfile
            # templates) can read the current model's label.
            model_id_label = model_attribute(model, "modelid_label")
            os.environ["LLMDBENCH_DEPLOY_CURRENT_MODEL_ID_LABEL"] = model_id_label

            # Format model number with zero padding (00, 01, ...) to match
            # the helmfile-NN.yaml naming used below.
            model_num = f"{model_number:02d}"

            # Create directory structure for this model's helm values.
            helm_dir = Path(ev["control_work_dir"]) / "setup" / "helm" / ev["vllm_modelservice_release"] / model_num
            helm_dir.mkdir(parents=True, exist_ok=True)

            # Read GAIE presets file content; a missing file is non-fatal and
            # yields an empty custom-config body.
            presets_path = Path(ev["vllm_modelservice_gaie_presets_full_path"])
            try:
                with open(presets_path, 'r') as f:
                    presets_content = f.read()
                # Indent each line with 6 spaces for YAML formatting, so the
                # presets nest under the `<name>: |` block scalar below.
                indented_presets = '\n'.join(f"      {line}" for line in presets_content.splitlines())
            except FileNotFoundError:
                announce(f"⚠️ Warning: GAIE presets file not found at {presets_path}")
                indented_presets = ""

            # Get image tag ("1" -> tag only; "auto" tags are resolved by get_image).
            image_tag = get_image(
                ev["llmd_inferencescheduler_image_registry"],
                ev["llmd_inferencescheduler_image_repo"],
                ev["llmd_inferencescheduler_image_name"],
                ev["llmd_inferencescheduler_image_tag"],
                "1"
            )

            # Generate GAIE values YAML content.
            gaie_values_content = f"""inferenceExtension:
  replicas: 1
  image:
    name: {ev['llmd_inferencescheduler_image_name']}
    hub: {ev['llmd_inferencescheduler_image_registry']}/{ev['llmd_inferencescheduler_image_repo']}
    tag: {image_tag}
    pullPolicy: Always
  extProcPort: 9002
  pluginsConfigFile: "{ev['vllm_modelservice_gaie_presets']}"

  # using upstream GIE default-plugins, see: https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/config/charts/inferencepool/templates/epp-config.yaml#L7C3-L56C33
  pluginsCustomConfig:
    {ev['vllm_modelservice_gaie_presets']}: |
{indented_presets}
inferencePool:
  targetPortNumber: {ev['vllm_common_inference_port']}
  modelServerType: vllm
  modelServers:
    matchLabels:
      llm-d.ai/inferenceServing: "true"
      llm-d.ai/model: {model_id_label}
"""

            # Write GAIE values file.
            gaie_values_file = helm_dir / "gaie-values.yaml"
            with open(gaie_values_file, 'w') as f:
                f.write(gaie_values_content)

            # Deploy helm chart via helmfile, selecting only this model's
            # gaie release by name.
            announce(f"🚀 Installing helm chart \"gaie-{ev['vllm_modelservice_release']}\" via helmfile...")
            helmfile_cmd = (
                f"helmfile --namespace {ev['vllm_common_namespace']} "
                f"--kubeconfig {ev['control_work_dir']}/environment/context.ctx "
                f"--selector name={ev['vllm_common_namespace']}-{model_id_label}-gaie "
                f"apply -f {ev['control_work_dir']}/setup/helm/{ev['vllm_modelservice_release']}/helmfile-{model_num}.yaml "
                f"--skip-diff-on-install"
            )

            llmdbench_execute_cmd(
                actual_cmd=helmfile_cmd,
                dry_run=int(ev.get("control_dry_run", 0)),
                verbose=int(ev.get("control_verbose", 0))
            )

            announce(f"✅ {ev['vllm_common_namespace']}-{model_id_label}-gaie helm chart deployed successfully")

            # List relevant resources; routes exist only on OpenShift.
            resource_list = "deployment,service,pods,secrets,inferencepools"
            if int(ev.get("control_deploy_is_openshift", 0)) == 1:
                resource_list += ",route"

            announce(f"ℹ️ A snapshot of the relevant (model-specific) resources on namespace \"{ev['vllm_common_namespace']}\":")

            # Snapshot is best-effort (fatal=False) and skipped on dry runs.
            if int(ev.get("control_dry_run", 0)) == 0:
                kubectl_cmd = f"{ev['control_kcmd']} get --namespace {ev['vllm_common_namespace']} {resource_list}"
                llmdbench_execute_cmd(
                    actual_cmd=kubectl_cmd,
                    dry_run=int(ev.get("control_dry_run", 0)),
                    verbose=int(ev.get("control_verbose", 0)),
                    fatal=False
                )

            # Clean up the per-model environment variable before the next iteration.
            if "LLMDBENCH_DEPLOY_CURRENT_MODEL_ID_LABEL" in os.environ:
                del os.environ["LLMDBENCH_DEPLOY_CURRENT_MODEL_ID_LABEL"]

            model_number += 1

        announce("✅ Completed model deployment")
    else:
        deploy_methods = ev.get("deploy_methods", "")
        announce(f"⏭️ Environment types are \"{deploy_methods}\". Skipping this step.")

    return 0
142+
143+
144+
# Script entry point: propagate main()'s return value as the process exit code.
if __name__ == "__main__":
    sys.exit(main())

0 commit comments

Comments
 (0)