@@ -55,13 +55,6 @@ def create_benchmark_command(model_name: str) -> str:
5555 """
5656
5757
58- def create_serve_command (model_name : str ) -> str :
59- return f"""vllm serve { model_name } \
60- --tensor-parallel-size 8 \
61- --pipeline-parallel-size 2 \
62- --max-num-batched-tokens 16384"""
63-
64-
6558def get_secret_hf_token ():
6659
6760 secret_name = "test/hf_token"
@@ -163,9 +156,16 @@ def test_vllm_benchmark_on_multi_node(head_connection, worker_connection, image_uri):
     head_container_id = get_container_id(head_connection, image_uri)
     print("Starting model serving inside Ray container...")
 
-    serve_cmd = create_serve_command(model_name)
-    serve_in_container = f"tmux new-session -d -s ray_head 'docker exec -it {head_container_id} /bin/bash -c \"{serve_cmd}\"'"
-    head_connection.run(serve_in_container)
+    commands_serving = [
+        "tmux new-session -d -s vllm_serve",
+        "tmux ls",
+        "tmux attach-session -t vllm_serve",
+        f'docker exec -it {head_container_id} /bin/bash -c "vllm serve {model_name} \
+        --tensor-parallel-size 8 \
+        --pipeline-parallel-size 2 \
+        --max-num-batched-tokens 16384"',
+    ]
+    head_connection.run("; ".join(commands_serving))
 
     print("Waiting for model to load (15 minutes)...")
     time.sleep(1000)
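
The test still waits out model loading with a fixed `time.sleep(1000)` (roughly 16.7 minutes). A minimal sketch of a readiness poll as an alternative, assuming vLLM's OpenAI-compatible server listens on its default port 8000 and exposes the `/health` route; `head_host` is a hypothetical hostname, not one of this test's fixtures:

```python
# Hypothetical readiness poll; not part of this PR. Assumes the vLLM
# OpenAI-compatible server exposes /health on port 8000 (its default).
import time

import requests


def wait_for_vllm_ready(head_host: str, timeout_seconds: int = 1800) -> None:
    deadline = time.time() + timeout_seconds
    while time.time() < deadline:
        try:
            # /health returns 200 once the engine is up and the model is loaded.
            if requests.get(f"http://{head_host}:8000/health", timeout=5).status_code == 200:
                return
        except requests.ConnectionError:
            pass  # server not accepting connections yet
        time.sleep(30)
    raise TimeoutError(f"vLLM server on {head_host} not healthy after {timeout_seconds}s")
```

Polling bounds the wait and fails fast if the server never comes up, where a fixed sleep can both under- and over-wait.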