Skip to content

Commit bd2ebd7

Browse files
committed
try vllm eks test with new test path
1 parent 5793460 commit bd2ebd7

File tree

13 files changed

+854
-17
lines changed

13 files changed

+854
-17
lines changed

test/platforms/entrypoint.py

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
from src.config import is_new_test_structure_enabled
44
from src.buildspec import Buildspec
55
from test.platforms.infra.ec2.setup import EC2Platform
6+
from test.platforms.infra.eks.setup import EKSPlatform
67
from test.test_utils import get_framework_from_image_uri
78

89

@@ -116,10 +117,7 @@ def main():
116117
raise
117118

118119
# Filter for applicable tests
119-
if test_type == "ec2":
120-
applicable_tests = [test for test in buildspec_data["tests"] if test["platform"].startswith("ec2")]
121-
else:
122-
applicable_tests = [test for test in buildspec_data["tests"] if test["platform"] == test_type]
120+
applicable_tests = [test for test in buildspec_data["tests"] if test["platform"].startswith(test_type)]
123121

124122
print(f"Found {len(buildspec_data['tests'])} test configurations")
125123
print(f"Found {len(applicable_tests)} applicable test configurations for {test_type}")
@@ -144,6 +142,22 @@ def main():
144142
except Exception as e:
145143
print(f"Test failed: {e}")
146144
raise
145+
elif test_type == "eks" and platform_name.startswith("eks"):
146+
print(f"Executing EKS test for platform: {platform_name}")
147+
platform = EKSPlatform()
148+
try:
149+
setup_params = {**test_config["params"], **buildspec_data["globals"]}
150+
print(f"Setup parameters: {setup_params}")
151+
platform.setup(setup_params)
152+
print(f"Platform setup completed")
153+
154+
print(f"Executing {len(test_config['run'])} commands:")
155+
for cmd in test_config["run"]:
156+
print(f" - {cmd}")
157+
platform.execute_command(cmd)
158+
except Exception as e:
159+
print(f"Test failed: {e}")
160+
raise
147161
else:
148162
print(f"Skipping test config {i+1}: test_type={test_type}, platform={platform_name}")
149163

test/platforms/infra/eks/__init__.py

Whitespace-only changes.

test/platforms/infra/eks/setup.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
import os
2+
from invoke import run
3+
from test.test_utils import LOGGER
4+
5+
class EKSPlatform:
    """Test platform that targets an EKS cluster.

    ``setup`` resolves the cluster name and namespace from the test
    parameters; ``execute_command`` then runs each test command with those
    values exported as environment variables.
    """

    def __init__(self):
        self.resources = None
        # REGION falls back to us-west-2 when it is not exported.
        self.region = os.getenv("REGION", "us-west-2")
        # BUILD_CONTEXT is used as the cluster-name suffix; validated in setup().
        self.build_context = os.getenv("BUILD_CONTEXT")
        self.cluster_name = None
        self.namespace = None

    def setup(self, params):
        """
        Resolve the EKS cluster name and namespace for the upcoming test run.

        Args:
            params: Mapping of test parameters. Reads ``framework`` (logged
                only), ``cluster`` (cluster-name prefix, required) and
                ``namespace`` (optional).

        Returns:
            None. The resolved values are stored on ``self.cluster_name`` and
            ``self.namespace``.

        Raises:
            ValueError: If the ``cluster`` parameter is missing, or if the
                BUILD_CONTEXT or DLC_IMAGE environment variable is not set.
        """
        print(f"Setting up EKS platform with params: {params}")

        framework = params.get("framework")
        cluster_prefix = params.get("cluster")

        # Fail early instead of formatting a bogus name such as "None-None",
        # which would only surface later as an obscure cluster lookup error.
        if not cluster_prefix:
            raise ValueError("'cluster' parameter not set in test params")
        if not self.build_context:
            raise ValueError("BUILD_CONTEXT environment variable not set")

        self.cluster_name = f"{cluster_prefix}-{self.build_context}"
        self.namespace = params.get("namespace")

        print(f"EKS Platform - Framework: {framework}")
        print(f"EKS Platform - Cluster: {self.cluster_name}")
        print(f"EKS Platform - Namespace: {self.namespace}")

        if not os.getenv("DLC_IMAGE"):
            raise ValueError("DLC_IMAGE environment variable not set")

    def _command_env(self):
        """Build the environment overrides for a test command, omitting unset values."""
        candidates = {
            "AWS_REGION": self.region,
            "CLUSTER_NAME": self.cluster_name,
            "NAMESPACE": self.namespace,
            "BUILD_CONTEXT": self.build_context,
            "DLC_IMAGE": os.getenv("DLC_IMAGE"),
        }
        # Subprocess environments only accept string values; a None entry
        # (e.g. no namespace configured) would raise TypeError at launch.
        return {name: value for name, value in candidates.items() if value is not None}

    def execute_command(self, cmd):
        """
        Execute a test command with the EKS-related environment variables set.

        Args:
            cmd: Command string handed to ``invoke.run``.
        """
        env = self._command_env()

        LOGGER.info(f"Executing command with EKS environment: {cmd}")
        run(cmd, env=env)

test/platforms/infra/eks/vllm/__init__.py

Whitespace-only changes.
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
apiVersion: eksctl.io/v1alpha5
2+
kind: ClusterConfig
3+
4+
metadata:
5+
name: ${CLUSTER_NAME}
6+
region: ${AWS_REGION}
7+
version: "${EKS_VERSION}" # EKS (Kubernetes) version, substituted from the EKS_VERSION variable
8+
9+
# Specify availability zones
10+
availabilityZones: ["${AWS_REGION}a", "${AWS_REGION}b"]
11+
12+
# Enable CloudWatch logging
13+
cloudWatch:
14+
clusterLogging:
15+
enableTypes: ["api", "audit", "authenticator", "controllerManager", "scheduler"]
16+
17+
# Add-ons for the cluster
18+
addons:
19+
- name: vpc-cni
20+
version: latest
21+
- name: coredns
22+
version: latest
23+
- name: kube-proxy
24+
version: latest
25+
- name: aws-ebs-csi-driver
26+
version: latest
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
apiVersion: v1
2+
kind: PersistentVolume
3+
metadata:
4+
name: fsx-lustre-pv
5+
spec:
6+
capacity:
7+
storage: 1200Gi # Adjust based on your FSx Lustre filesystem size
8+
volumeMode: Filesystem
9+
accessModes:
10+
- ReadWriteMany
11+
persistentVolumeReclaimPolicy: Retain
12+
storageClassName: fsx-sc
13+
csi:
14+
driver: fsx.csi.aws.com
15+
volumeHandle: <fs-id> # FSx Lustre filesystem ID
16+
volumeAttributes:
17+
dnsname: <dns-name> # FSx Lustre DNS name
18+
mountname: <mount-name> # The mount name of your FSx Lustre filesystem
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
apiVersion: v1
2+
kind: PersistentVolumeClaim
3+
metadata:
4+
name: fsx-lustre-pvc
5+
namespace: vllm
6+
spec:
7+
accessModes:
8+
- ReadWriteMany
9+
storageClassName: fsx-sc
10+
resources:
11+
requests:
12+
storage: 1200Gi # Should match the PV capacity
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
apiVersion: storage.k8s.io/v1
2+
kind: StorageClass
3+
metadata:
4+
name: fsx-sc
5+
provisioner: fsx.csi.aws.com
6+
parameters:
7+
subnetId: <subnet-id>
8+
securityGroupIds: <sg-id>
9+
deploymentType: SCRATCH_2
10+
automaticBackupRetentionDays: "0"
11+
dailyAutomaticBackupStartTime: "00:00"
12+
copyTagsToBackups: "false"
13+
perUnitStorageThroughput: "50"
14+
dataCompressionType: "NONE"
15+
reclaimPolicy: Retain
16+
volumeBindingMode: Immediate
17+
mountOptions:
18+
- flock
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
apiVersion: eksctl.io/v1alpha5
2+
kind: ClusterConfig
3+
4+
metadata:
5+
name: ${CLUSTER_NAME}
6+
region: ${AWS_REGION}
7+
8+
managedNodeGroups:
9+
- name: vllm-p4d-nodes-efa
10+
instanceType: p4d.24xlarge
11+
minSize: 0
12+
maxSize: 2
13+
desiredCapacity: 0
14+
availabilityZones: ["${AWS_REGION}a"] # EFA-enabled nodegroups must have only one availability zone
15+
volumeSize: 100
16+
privateNetworking: true
17+
amiFamily: AmazonLinux2023
18+
ami: ami-009600863d7ec0b6a
19+
preBootstrapCommands:
20+
- dnf install -y lustre-client
21+
- lfs --version
22+
labels:
23+
role: large-model-worker
24+
nvidia.com/gpu: "true"
25+
k8s.amazonaws.com/accelerator: nvidia-gpu
26+
aws.amazon.com/efa: "true" # Add EFA label
27+
tags:
28+
nodegroup-role: large-model-worker
29+
k8s.io/cluster-autoscaler/node-template/label/role: large-model-worker
30+
k8s.io/cluster-autoscaler/enabled: "true"
31+
k8s.io/cluster-autoscaler/${CLUSTER_NAME}: "owned"
32+
k8s.io/cluster-autoscaler/node-template/resources/vpc.amazonaws.com/efa: "4"
33+
k8s.io/cluster-autoscaler/node-template/label/aws.amazon.com/efa: "true"
34+
iam:
35+
withAddonPolicies:
36+
autoScaler: true
37+
albIngress: true
38+
cloudWatch: true
39+
ebs: true
40+
imageBuilder: true
41+
# Enable EFA interfaces
42+
efaEnabled: true
43+
capacityReservation:
44+
capacityReservationTarget:
45+
capacityReservationID: "cr-08e4079a2d40aee96"
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
apiVersion: networking.k8s.io/v1
2+
kind: Ingress
3+
metadata:
4+
name: vllm-deepseek-32b-lws-ingress
5+
namespace: vllm
6+
annotations:
7+
# Use AWS Load Balancer Controller with ALB
8+
alb.ingress.kubernetes.io/scheme: internet-facing
9+
alb.ingress.kubernetes.io/target-type: ip
10+
alb.ingress.kubernetes.io/security-groups: <sg-id>
11+
alb.ingress.kubernetes.io/healthcheck-path: /health
12+
alb.ingress.kubernetes.io/healthcheck-port: '8000'
13+
alb.ingress.kubernetes.io/healthcheck-protocol: HTTP
14+
alb.ingress.kubernetes.io/listen-ports: '[{"HTTP": 80}]'
15+
alb.ingress.kubernetes.io/load-balancer-attributes: load_balancing.cross_zone.enabled=true
16+
# Specify ALB class
17+
kubernetes.io/ingress.class: alb
18+
spec:
19+
ingressClassName: alb
20+
rules:
21+
- http:
22+
paths:
23+
- path: /
24+
pathType: Prefix
25+
backend:
26+
service:
27+
name: vllm-deepseek-32b-lws-leader
28+
port:
29+
number: 8000

0 commit comments

Comments
 (0)