Skip to content

Commit 072c790

Browse files
test efa
1 parent 9f3ca48 commit 072c790

File tree

1 file changed

+71
-19
lines changed

1 file changed

+71
-19
lines changed

test/vllm/ec2/infra/setup_ec2.py

Lines changed: 71 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -288,7 +288,6 @@ def efa_ec2_instances(
288288
raise Exception(f"Error allocating elastic IP: {str(e)}")
289289

290290
connections = setup_test_artifacts(ec2_client, instances, key_filename, region)
291-
print("connections", connections)
292291
return_val = {
293292
"instances": [
294293
(instance_info["InstanceId"], key_filename) for instance_info in instances
@@ -374,29 +373,22 @@ def ec2_test_environment():
374373
LOGGER.error(f"Error during cleanup: {str(cleanup_error)}")
375374

376375

377-
def _setup_instance(connections, fsx_dns_name, mount_name):
376+
def _setup_instance(connection, fsx_dns_name, mount_name):
378377
"""
379378
Setup FSx mount and VLLM environment on an instance synchronously
380379
"""
381-
master_connection = connections[0]
382-
os.chdir("..")
380+
# Copy script to instance
381+
connection.put("vllm/ec2/utils/setup_fsx_vllm.sh", "/home/ec2-user/setup_fsx_vllm.sh")
383382

384-
master_connection.put("vllm/ec2/utils/setup_fsx_vllm.sh", "/home/ec2-user/setup_fsx_vllm.sh")
385-
386-
master_conn_commands = [
383+
# Make script executable and run it
384+
commands = [
387385
"chmod +x /home/ec2-user/setup_fsx_vllm.sh",
388386
f"/home/ec2-user/setup_fsx_vllm.sh {fsx_dns_name} {mount_name}",
389387
]
390-
master_connection.run("; ".join(master_conn_commands))
391388

392-
# Create mount directory and mount FSx
393-
worker_conn_commands = [
394-
"sudo yum install -y lustre-client",
395-
"sudo mkdir -p /fsx",
396-
f"sudo mount -t lustre -o relatime,flock {fsx_dns_name}@tcp:/{mount_name} /fsx",
397-
]
398-
worker_connection = connections[1]
399-
worker_connection.run("; ".join(worker_conn_commands))
389+
# Execute commands synchronously
390+
result = connection.run("; ".join(commands))
391+
return result
400392

401393

402394
def cleanup_resources(ec2_cli, resources, fsx):
@@ -528,6 +520,52 @@ def configure_security_groups(instance_id, ec2_cli, fsx, vpc_id, instances_info)
528520
raise
529521

530522

523+
def setup_instance(instance_id, key_filename, ec2_cli, fsx_dns_name, mount_name):
    """
    Run the FSx/vLLM setup on a single EC2 instance over SSH.

    Looks up the instance's public IP with ``describe_instances``, opens an
    SSH connection as ``ec2-user`` using ``key_filename``, and delegates the
    actual work to ``_setup_instance``.

    Args:
        instance_id: ID of the EC2 instance to set up.
        key_filename: Path to the private key used for the SSH connection.
        ec2_cli: EC2 client exposing ``describe_instances``.
        fsx_dns_name: DNS name of the FSx filesystem, forwarded to
            ``_setup_instance``.
        mount_name: FSx mount name, forwarded to ``_setup_instance``.

    Returns:
        Whatever ``_setup_instance`` returns for this connection.

    Raises:
        Exception: If the instance has no public IP address.
    """
    reservations = ec2_cli.describe_instances(InstanceIds=[instance_id])["Reservations"]
    instance_details = reservations[0]["Instances"][0]
    public_ip = instance_details.get("PublicIpAddress")

    if not public_ip:
        raise Exception(f"No public IP found for instance {instance_id}")

    # Use the connection as a context manager so the SSH session is closed
    # even when the setup script fails; previously it was never closed.
    with Connection(
        host=public_ip,
        user="ec2-user",
        connect_kwargs={"key_filename": key_filename},
    ) as connection:
        return _setup_instance(connection, fsx_dns_name, mount_name)
540+
541+
542+
def mount_fsx_on_worker(instance_id, key_filename, ec2_cli, fsx_dns_name, mount_name):
    """
    Mount the FSx Lustre filesystem on a worker instance.

    Unlike ``setup_instance`` this does not run the setup script; it only
    installs the Lustre client, creates ``/fsx`` and mounts the filesystem
    there.

    Args:
        instance_id: ID of the worker EC2 instance.
        key_filename: Path to the private key used for the SSH connection.
        ec2_cli: EC2 client exposing ``describe_instances``.
        fsx_dns_name: DNS name of the FSx filesystem to mount.
        mount_name: FSx mount name used in the Lustre mount source.

    Raises:
        Exception: If the instance has no public IP address.
    """
    reservations = ec2_cli.describe_instances(InstanceIds=[instance_id])["Reservations"]
    instance_details = reservations[0]["Instances"][0]
    public_ip = instance_details.get("PublicIpAddress")

    if not public_ip:
        raise Exception(f"No public IP found for instance {instance_id}")

    # Install the client, create the mount point, then mount FSx
    commands = [
        "sudo yum install -y lustre-client",
        "sudo mkdir -p /fsx",
        f"sudo mount -t lustre -o relatime,flock {fsx_dns_name}@tcp:/{mount_name} /fsx",
    ]

    # Use the connection as a context manager so the SSH session is closed
    # even when one of the commands fails; previously it was never closed.
    with Connection(
        host=public_ip,
        user="ec2-user",
        connect_kwargs={"key_filename": key_filename},
    ) as connection:
        # Each command runs as its own remote invocation, in order.
        # NOTE(review): with fabric's default warn=False a non-zero exit
        # should raise and stop the sequence - confirm against fabric docs.
        for cmd in commands:
            connection.run(cmd)
567+
568+
531569
def setup():
532570
"""Main setup function for VLLM on EC2 with FSx"""
533571
print("Testing vllm on ec2........")
@@ -562,12 +600,26 @@ def setup():
562600
)
563601
print("Created FSx filesystem")
564602

565-
_setup_instance(
566-
resources["connections"],
603+
master_instance_id, master_key_filename = resources["instances_info"][0]
604+
setup_instance(
605+
master_instance_id,
606+
master_key_filename,
607+
ec2_cli,
608+
resources["fsx_config"]["dns_name"],
609+
resources["fsx_config"]["mount_name"],
610+
)
611+
print(f"Setup completed for master instance {master_instance_id}")
612+
613+
# Mount FSx on worker node
614+
worker_instance_id, worker_key_filename = resources["instances_info"][1]
615+
mount_fsx_on_worker(
616+
worker_instance_id,
617+
worker_key_filename,
618+
ec2_cli,
567619
resources["fsx_config"]["dns_name"],
568620
resources["fsx_config"]["mount_name"],
569621
)
570-
print(f"Setup completed for master and worker instance")
622+
print(f"FSx mounted on worker instance {worker_instance_id}")
571623

572624
return resources
573625

0 commit comments

Comments
 (0)