@@ -288,7 +288,6 @@ def efa_ec2_instances(
288288 raise Exception (f"Error allocating elastic IP: { str (e )} " )
289289
290290 connections = setup_test_artifacts (ec2_client , instances , key_filename , region )
291- print ("connections" , connections )
292291 return_val = {
293292 "instances" : [
294293 (instance_info ["InstanceId" ], key_filename ) for instance_info in instances
@@ -374,29 +373,22 @@ def ec2_test_environment():
374373 LOGGER .error (f"Error during cleanup: { str (cleanup_error )} " )
375374
376375
377- def _setup_instance (connections , fsx_dns_name , mount_name ):
376+ def _setup_instance (connection , fsx_dns_name , mount_name ):
378377 """
379378 Setup FSx mount and VLLM environment on an instance synchronously
380379 """
381- master_connection = connections [ 0 ]
382- os . chdir ( ".. " )
380+ # Copy script to instance
381+ connection . put ( "vllm/ec2/utils/setup_fsx_vllm.sh" , "/home/ec2-user/setup_fsx_vllm.sh " )
383382
384- master_connection .put ("vllm/ec2/utils/setup_fsx_vllm.sh" , "/home/ec2-user/setup_fsx_vllm.sh" )
385-
386- master_conn_commands = [
383+ # Make script executable and run it
384+ commands = [
387385 "chmod +x /home/ec2-user/setup_fsx_vllm.sh" ,
388386 f"/home/ec2-user/setup_fsx_vllm.sh { fsx_dns_name } { mount_name } " ,
389387 ]
390- master_connection .run ("; " .join (master_conn_commands ))
391388
392- # Create mount directory and mount FSx
393- worker_conn_commands = [
394- "sudo yum install -y lustre-client" ,
395- "sudo mkdir -p /fsx" ,
396- f"sudo mount -t lustre -o relatime,flock { fsx_dns_name } @tcp:/{ mount_name } /fsx" ,
397- ]
398- worker_connection = connections [1 ]
399- worker_connection .run ("; " .join (worker_conn_commands ))
389+ # Execute commands synchronously
390+ result = connection .run ("; " .join (commands ))
391+ return result
400392
401393
402394def cleanup_resources (ec2_cli , resources , fsx ):
@@ -528,6 +520,52 @@ def configure_security_groups(instance_id, ec2_cli, fsx, vpc_id, instances_info)
528520 raise
529521
530522
def setup_instance(instance_id, key_filename, ec2_cli, fsx_dns_name, mount_name):
    """Setup FSx mount on a single instance.

    Looks up the instance's public IP through ``ec2_cli.describe_instances``,
    opens a connection to it as ``ec2-user`` with the given key file, and
    delegates to ``_setup_instance``.

    Args:
        instance_id: ID of the instance to set up.
        key_filename: Path of the private key used to authenticate.
        ec2_cli: Client object providing ``describe_instances``.
        fsx_dns_name: FSx DNS name forwarded to ``_setup_instance``.
        mount_name: FSx mount name forwarded to ``_setup_instance``.

    Returns:
        The value returned by ``_setup_instance``.

    Raises:
        Exception: If the instance has no public IP address.
    """
    response = ec2_cli.describe_instances(InstanceIds=[instance_id])
    instance_details = response["Reservations"][0]["Instances"][0]
    public_ip = instance_details.get("PublicIpAddress")

    if not public_ip:
        raise Exception(f"No public IP found for instance {instance_id}")

    # Use the connection as a context manager so it is closed once setup has
    # finished, including when the remote command raises.
    with Connection(
        host=public_ip,
        user="ec2-user",
        connect_kwargs={"key_filename": key_filename},
    ) as connection:
        return _setup_instance(connection, fsx_dns_name, mount_name)
540+
541+
def mount_fsx_on_worker(instance_id, key_filename, ec2_cli, fsx_dns_name, mount_name):
    """Mount FSx on worker instance without running setup script.

    Looks up the instance's public IP through ``ec2_cli.describe_instances``,
    connects as ``ec2-user`` with the given key file, then installs the Lustre
    client, creates ``/fsx`` and mounts the filesystem there.

    Args:
        instance_id: ID of the worker instance.
        key_filename: Path of the private key used to authenticate.
        ec2_cli: Client object providing ``describe_instances``.
        fsx_dns_name: FSx DNS name used in the mount source.
        mount_name: FSx mount name used in the mount source.

    Raises:
        Exception: If the instance has no public IP address.
    """
    response = ec2_cli.describe_instances(InstanceIds=[instance_id])
    instance_details = response["Reservations"][0]["Instances"][0]
    public_ip = instance_details.get("PublicIpAddress")

    if not public_ip:
        raise Exception(f"No public IP found for instance {instance_id}")

    # Create mount directory and mount FSx
    commands = [
        "sudo yum install -y lustre-client",
        "sudo mkdir -p /fsx",
        f"sudo mount -t lustre -o relatime,flock {fsx_dns_name}@tcp:/{mount_name} /fsx",
    ]

    # Use the connection as a context manager so it is closed after the last
    # command, including when one of the commands raises.
    with Connection(
        host=public_ip,
        user="ec2-user",
        connect_kwargs={"key_filename": key_filename},
    ) as connection:
        # Run one command at a time so a failure is attributed to the exact step.
        for cmd in commands:
            connection.run(cmd)
567+
568+
531569def setup ():
532570 """Main setup function for VLLM on EC2 with FSx"""
533571 print ("Testing vllm on ec2........" )
@@ -562,12 +600,26 @@ def setup():
562600 )
563601 print ("Created FSx filesystem" )
564602
565- _setup_instance (
566- resources ["connections" ],
603+ master_instance_id , master_key_filename = resources ["instances_info" ][0 ]
604+ setup_instance (
605+ master_instance_id ,
606+ master_key_filename ,
607+ ec2_cli ,
608+ resources ["fsx_config" ]["dns_name" ],
609+ resources ["fsx_config" ]["mount_name" ],
610+ )
611+ print (f"Setup completed for master instance { master_instance_id } " )
612+
613+ # Mount FSx on worker node
614+ worker_instance_id , worker_key_filename = resources ["instances_info" ][1 ]
615+ mount_fsx_on_worker (
616+ worker_instance_id ,
617+ worker_key_filename ,
618+ ec2_cli ,
567619 resources ["fsx_config" ]["dns_name" ],
568620 resources ["fsx_config" ]["mount_name" ],
569621 )
570- print (f"Setup completed for master and worker instance" )
622+ print (f"FSx mounted on worker instance { worker_instance_id } " )
571623
572624 return resources
573625
0 commit comments