Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 6 additions & 4 deletions buildkite/pipeline_generator/buildkite_step.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from pydantic import BaseModel
from typing import Dict, List, Optional, Any, Union
from step import Step
from utils_lib.docker_utils import get_image, get_ecr_cache_registry
from utils_lib.docker_utils import get_image, resolve_ecr_cache_vars
from global_config import get_global_config
from plugin.k8s_plugin import get_k8s_plugin
from plugin.docker_plugin import get_docker_plugin
Expand Down Expand Up @@ -87,7 +87,7 @@ def _get_variables_to_inject() -> Dict[str, str]:
if global_config["name"] != "vllm_ci":
return {}

cache_from_tag, cache_to_tag = get_ecr_cache_registry()
cache_from, cache_from_base_branch, cache_from_main, cache_to = resolve_ecr_cache_vars()
return {
"$REGISTRY": global_config["registries"],
"$REPO": global_config["repositories"]["main"]
Expand All @@ -97,8 +97,10 @@ def _get_variables_to_inject() -> Dict[str, str]:
"$BRANCH": global_config["branch"],
"$VLLM_USE_PRECOMPILED": "1" if global_config["use_precompiled"] else "0",
"$VLLM_MERGE_BASE_COMMIT": global_config["merge_base_commit"],
"$CACHE_FROM": cache_from_tag,
"$CACHE_TO": cache_to_tag,
"$CACHE_FROM": cache_from,
"$CACHE_FROM_BASE_BRANCH": cache_from_base_branch,
"$CACHE_FROM_MAIN": cache_from_main,
"$CACHE_TO": cache_to,
}


Expand Down
121 changes: 52 additions & 69 deletions buildkite/pipeline_generator/utils_lib/docker_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,80 +21,63 @@ def get_image(cpu: bool = False) -> str:
return image


def _clean_docker_tag(tag: str) -> str:
# Only allows alphanumeric, dashes and underscores for Docker tags, and replaces others with '-'
return re.sub(r"[^a-zA-Z0-9_.-]", "-", tag or "")
def clean_docker_tag(tag: str) -> str:
"""
Function to replace invalid characters in Docker image tags and truncate to 128 chars
Valid characters: a-z, A-Z, 0-9, _, ., -
"""
# Replace invalid characters with underscore and truncate to 128 chars
cleaned = re.sub(r"[^a-zA-Z0-9._-]", "_", tag or "")
return cleaned[:128]


def _docker_manifest_exists(image_tag: str) -> bool:
try:
subprocess.run(
["docker", "manifest", "inspect", image_tag],
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
check=True,
)
return True
except subprocess.CalledProcessError:
return False


def get_ecr_cache_registry() -> Tuple[str, str]:
def resolve_ecr_cache_vars() -> Tuple[str, str, str, str]:
"""
Resolve ECR cache-from, cache-to using buildkite environment variables:
- BUILDKITE_BRANCH
- BUILDKITE_PULL_REQUEST
- BUILDKITE_PULL_REQUEST_BASE_BRANCH
Return tuple of:
- CACHE_FROM: primary cache source
- CACHE_FROM_BASE_BRANCH: secondary cache source
- CACHE_FROM_MAIN: fallback cache source
- CACHE_TO: cache destination
Note: CACHE_FROM, CACHE_FROM_BASE_BRANCH, CACHE_FROM_MAIN could be the same.
This is intended behavior to allow BuildKit to merge all possible cache source
to maximize cache hit potential.
"""
global_config = get_global_config()
branch = global_config["branch"]
pull_request = global_config["pull_request"]

# Define ECR repository URLs for test and main cache
test_cache_ecr = "936637512419.dkr.ecr.us-east-1.amazonaws.com/vllm-ci-test-cache"
postmerge_cache_ecr = (
"936637512419.dkr.ecr.us-east-1.amazonaws.com/vllm-ci-postmerge-cache"
)
cache_from_tag, cache_to_tag = None, None
# Authenticate Docker to AWS ECR
login_cmd = ["aws", "ecr", "get-login-password", "--region", "us-east-1"]
try:
proc = subprocess.Popen(login_cmd, stdout=subprocess.PIPE)
subprocess.run(
[
"docker",
"login",
"--username",
"AWS",
"--password-stdin",
"936637512419.dkr.ecr.us-east-1.amazonaws.com",
],
stdin=proc.stdout,
check=True,
)
proc.stdout.close()
proc.wait()
except Exception as e:
raise RuntimeError(f"Failed to authenticate with AWS ECR: {e}")

if global_config["pull_request"]: # PR build
cache_to_tag = f"{test_cache_ecr}:pr-{global_config['pull_request']}"
if _docker_manifest_exists(cache_to_tag): # use PR cache if exists
cache_from_tag = cache_to_tag
elif (
os.getenv("BUILDKITE_PULL_REQUEST_BASE_BRANCH") != "main"
): # use base branch cache if exists
clean_base = _clean_docker_tag(
os.getenv("BUILDKITE_PULL_REQUEST_BASE_BRANCH")
)
if _docker_manifest_exists(f"{test_cache_ecr}:{clean_base}"):
cache_from_tag = f"{test_cache_ecr}:{clean_base}"
else: # fall back to postmerge cache ecr if base branch cache does not exist
cache_from_tag = f"{postmerge_cache_ecr}:latest"
main_cache_ecr = "936637512419.dkr.ecr.us-east-1.amazonaws.com/vllm-ci-postmerge-cache"

if not pull_request or pull_request == "false":
# Not a PR
if branch == "main":
cache = f"{main_cache_ecr}:latest"
else:
cache_from_tag = f"{postmerge_cache_ecr}:latest"
else: # non-PR build
if branch == "main": # postmerge
cache_to_tag = f"{postmerge_cache_ecr}:latest"
cache_from_tag = f"{postmerge_cache_ecr}:latest"
clean_branch = clean_docker_tag(branch)
cache = f"{test_cache_ecr}:{clean_branch}"

cache_to = cache
cache_from = cache
cache_from_base_branch = cache
else:
# PR build
cache_to = f"{test_cache_ecr}:pr-{pull_request}"
cache_from = f"{test_cache_ecr}:pr-{pull_request}"

base_branch = os.getenv("BUILDKITE_PULL_REQUEST_BASE_BRANCH", "main")
if base_branch == "main":
cache_from_base_branch = f"{main_cache_ecr}:latest"
else:
clean_branch = _clean_docker_tag(branch)
cache_to_tag = f"{test_cache_ecr}:{clean_branch}"
if _docker_manifest_exists(f"{test_cache_ecr}:{clean_branch}"):
cache_from_tag = f"{test_cache_ecr}:{clean_branch}"
else:
cache_from_tag = f"{postmerge_cache_ecr}:latest"
if not cache_from_tag or not cache_to_tag:
raise RuntimeError("Failed to get ECR cache tags")
return cache_from_tag, cache_to_tag
clean_base = clean_docker_tag(base_branch)
cache_from_base_branch = f"{test_cache_ecr}:{clean_base}"

cache_from_main = f"{main_cache_ecr}:latest"

return cache_from, cache_from_base_branch, cache_from_main, cache_to