diff --git a/buildkite/pipeline_generator/buildkite_step.py b/buildkite/pipeline_generator/buildkite_step.py index 01c2738d..c383073d 100644 --- a/buildkite/pipeline_generator/buildkite_step.py +++ b/buildkite/pipeline_generator/buildkite_step.py @@ -1,7 +1,7 @@ from pydantic import BaseModel from typing import Dict, List, Optional, Any, Union from step import Step -from utils_lib.docker_utils import get_image, get_ecr_cache_registry +from utils_lib.docker_utils import get_image, resolve_ecr_cache_vars from global_config import get_global_config from plugin.k8s_plugin import get_k8s_plugin from plugin.docker_plugin import get_docker_plugin @@ -87,7 +87,7 @@ def _get_variables_to_inject() -> Dict[str, str]: if global_config["name"] != "vllm_ci": return {} - cache_from_tag, cache_to_tag = get_ecr_cache_registry() + cache_from, cache_from_base_branch, cache_from_main, cache_to = resolve_ecr_cache_vars() return { "$REGISTRY": global_config["registries"], "$REPO": global_config["repositories"]["main"] @@ -97,8 +97,10 @@ def _get_variables_to_inject() -> Dict[str, str]: "$BRANCH": global_config["branch"], "$VLLM_USE_PRECOMPILED": "1" if global_config["use_precompiled"] else "0", "$VLLM_MERGE_BASE_COMMIT": global_config["merge_base_commit"], - "$CACHE_FROM": cache_from_tag, - "$CACHE_TO": cache_to_tag, + "$CACHE_FROM": cache_from, + "$CACHE_FROM_BASE_BRANCH": cache_from_base_branch, + "$CACHE_FROM_MAIN": cache_from_main, + "$CACHE_TO": cache_to, } diff --git a/buildkite/pipeline_generator/utils_lib/docker_utils.py b/buildkite/pipeline_generator/utils_lib/docker_utils.py index be0991d0..ff3a2306 100644 --- a/buildkite/pipeline_generator/utils_lib/docker_utils.py +++ b/buildkite/pipeline_generator/utils_lib/docker_utils.py @@ -21,80 +21,63 @@ def get_image(cpu: bool = False) -> str: return image -def _clean_docker_tag(tag: str) -> str: - # Only allows alphanumeric, dashes and underscores for Docker tags, and replaces others with '-' - return re.sub(r"[^a-zA-Z0-9_.-]", "-", tag or "") +def clean_docker_tag(tag: str) -> str: + """ + Function to replace invalid characters in Docker image tags and truncate to 128 chars + Valid characters: a-z, A-Z, 0-9, _, ., - + """ + # Replace invalid characters with underscore and truncate to 128 chars + cleaned = re.sub(r"[^a-zA-Z0-9._-]", "_", tag or "") + return cleaned[:128] -def _docker_manifest_exists(image_tag: str) -> bool: - try: - subprocess.run( - ["docker", "manifest", "inspect", image_tag], - stdout=subprocess.DEVNULL, - stderr=subprocess.DEVNULL, - check=True, - ) - return True - except subprocess.CalledProcessError: - return False - -def get_ecr_cache_registry() -> Tuple[str, str]: +def resolve_ecr_cache_vars() -> Tuple[str, str, str, str]: + """ + Resolve ECR cache-from, cache-to using buildkite environment variables: + - BUILDKITE_BRANCH + - BUILDKITE_PULL_REQUEST + - BUILDKITE_PULL_REQUEST_BASE_BRANCH + Return tuple of: + - CACHE_FROM: primary cache source + - CACHE_FROM_BASE_BRANCH: secondary cache source + - CACHE_FROM_MAIN: fallback cache source + - CACHE_TO: cache destination + Note: CACHE_FROM, CACHE_FROM_BASE_BRANCH, CACHE_FROM_MAIN could be the same. + This is intended behavior to allow BuildKit to merge all possible cache source + to maximize cache hit potential. + """ global_config = get_global_config() branch = global_config["branch"] + pull_request = global_config["pull_request"] + + # Define ECR repository URLs for test and main cache test_cache_ecr = "936637512419.dkr.ecr.us-east-1.amazonaws.com/vllm-ci-test-cache" - postmerge_cache_ecr = ( - "936637512419.dkr.ecr.us-east-1.amazonaws.com/vllm-ci-postmerge-cache" - ) - cache_from_tag, cache_to_tag = None, None - # Authenticate Docker to AWS ECR - login_cmd = ["aws", "ecr", "get-login-password", "--region", "us-east-1"] - try: - proc = subprocess.Popen(login_cmd, stdout=subprocess.PIPE) - subprocess.run( - [ - "docker", - "login", - "--username", - "AWS", - "--password-stdin", - "936637512419.dkr.ecr.us-east-1.amazonaws.com", - ], - stdin=proc.stdout, - check=True, - ) - proc.stdout.close() - proc.wait() - except Exception as e: - raise RuntimeError(f"Failed to authenticate with AWS ECR: {e}") - - if global_config["pull_request"]: # PR build - cache_to_tag = f"{test_cache_ecr}:pr-{global_config['pull_request']}" - if _docker_manifest_exists(cache_to_tag): # use PR cache if exists - cache_from_tag = cache_to_tag - elif ( - os.getenv("BUILDKITE_PULL_REQUEST_BASE_BRANCH") != "main" - ): # use base branch cache if exists - clean_base = _clean_docker_tag( - os.getenv("BUILDKITE_PULL_REQUEST_BASE_BRANCH") - ) - if _docker_manifest_exists(f"{test_cache_ecr}:{clean_base}"): - cache_from_tag = f"{test_cache_ecr}:{clean_base}" - else: # fall back to postmerge cache ecr if base branch cache does not exist - cache_from_tag = f"{postmerge_cache_ecr}:latest" + main_cache_ecr = "936637512419.dkr.ecr.us-east-1.amazonaws.com/vllm-ci-postmerge-cache" + + if not pull_request or pull_request == "false": + # Not a PR + if branch == "main": + cache = f"{main_cache_ecr}:latest" else: - cache_from_tag = f"{postmerge_cache_ecr}:latest" - else: # non-PR build - if branch == "main": # postmerge - cache_to_tag = f"{postmerge_cache_ecr}:latest" - cache_from_tag = f"{postmerge_cache_ecr}:latest" + clean_branch = clean_docker_tag(branch) + cache = f"{test_cache_ecr}:{clean_branch}" + + cache_to = cache + cache_from = cache + cache_from_base_branch = cache + else: + # PR build + cache_to = f"{test_cache_ecr}:pr-{pull_request}" + cache_from = f"{test_cache_ecr}:pr-{pull_request}" + + base_branch = os.getenv("BUILDKITE_PULL_REQUEST_BASE_BRANCH", "main") + if base_branch == "main": + cache_from_base_branch = f"{main_cache_ecr}:latest" else: - clean_branch = _clean_docker_tag(branch) - cache_to_tag = f"{test_cache_ecr}:{clean_branch}" - if _docker_manifest_exists(f"{test_cache_ecr}:{clean_branch}"): - cache_from_tag = f"{test_cache_ecr}:{clean_branch}" - else: - cache_from_tag = f"{postmerge_cache_ecr}:latest" - if not cache_from_tag or not cache_to_tag: - raise RuntimeError("Failed to get ECR cache tags") - return cache_from_tag, cache_to_tag + clean_base = clean_docker_tag(base_branch) + cache_from_base_branch = f"{test_cache_ecr}:{clean_base}" + + cache_from_main = f"{main_cache_ecr}:latest" + + return cache_from, cache_from_base_branch, cache_from_main, cache_to