11import os
22import time
33import pytest
4+ import logging
5+ import sys
46
57from src .benchmark_metrics import (
68 PYTORCH_INFERENCE_GPU_THRESHOLD ,
1012from test .test_utils import (
1113 CONTAINER_TESTS_PREFIX ,
1214 get_framework_and_version_from_tag ,
13- UL20_CPU_ARM64_US_WEST_2 ,
15+ UL22_BASE_ARM64_DLAMI_US_WEST_2 ,
1416 login_to_ecr_registry ,
1517 get_account_id_from_image_uri ,
1618 LOGGER ,
1719)
1820from test .test_utils .ec2 import (
1921 ec2_performance_upload_result_to_s3_and_validate ,
2022 post_process_inference ,
23+ get_ec2_instance_type ,
2124)
2225
26+
# Module logger: emits INFO-level benchmark progress to stderr so it shows up
# in the pytest console output.
# NOTE(review): this rebinding shadows the LOGGER imported from test.test_utils
# above — confirm the shadowing is intentional.
LOGGER = logging.getLogger(__name__)
LOGGER.setLevel(logging.INFO)
# Guard prevents stacking duplicate stream handlers (and duplicated log lines)
# if this module is imported more than once during pytest collection.
if not LOGGER.handlers:
    LOGGER.addHandler(logging.StreamHandler(sys.stderr))


# Benchmark driver script mounted into the container under /test.
PT_PERFORMANCE_INFERENCE_SCRIPT = os.path.join(
    CONTAINER_TESTS_PREFIX, "benchmark", "run_pytorch_inference_performance.py"
)
PT_PERFORMANCE_INFERENCE_CPU_CMD = f"{PT_PERFORMANCE_INFERENCE_SCRIPT} --iterations 500 "
PT_PERFORMANCE_INFERENCE_GPU_CMD = f"{PT_PERFORMANCE_INFERENCE_SCRIPT} --iterations 1000 --gpu"
# Use the original p3.16xlarge instance, consider if use single gpu instance like g4dn.4xlarge, g5.4xlarge
PT_EC2_GPU_INSTANCE_TYPE = ["p3.16xlarge"]
PT_EC2_CPU_INSTANCE_TYPE = get_ec2_instance_type(default="c5.18xlarge", processor="cpu")
# c6g.4xlarge c6g.8xlarge reaches the 100% cpu usage for the benchmark when run VGG13 model
PT_EC2_CPU_ARM64_INSTANCE_TYPES = ["c7g.4xlarge", "c8g.4xlarge", "m7g.4xlarge", "r8g.4xlarge"]
PT_EC2_GPU_ARM64_INSTANCE_TYPE = get_ec2_instance_type(
    default="g5g.4xlarge", processor="gpu", arch_type="arm64"
)
2845
2946
@pytest.mark.model(
    "VGG13, MobileNet_V2, GoogLeNet, DenseNet121, Inception_V3, ResNet18, ResNet50, ViT_B_16, Bert_128, Bert_256, Roberta_128, Roberta_256, DistilBert_128, DistilBert_256, All-MPNet_128, All-MPNet_256, ASR"
)
@pytest.mark.parametrize("ec2_instance_type", PT_EC2_GPU_INSTANCE_TYPE, indirect=True)
@pytest.mark.team("conda")
def test_performance_ec2_pytorch_inference_gpu(
    pytorch_inference, ec2_connection, region, gpu_only, ec2_instance_type
):
    """Run the synthetic GPU inference benchmark against a PyTorch inference image
    and validate results against the per-framework-version GPU thresholds."""
    _, framework_version = get_framework_and_version_from_tag(pytorch_inference)
    ec2_performance_pytorch_inference(
        pytorch_inference,
        "gpu",
        ec2_instance_type,
        ec2_connection,
        region,
        PT_PERFORMANCE_INFERENCE_GPU_CMD,
        get_threshold_for_image(framework_version, PYTORCH_INFERENCE_GPU_THRESHOLD),
    )
4466
4567
@pytest.mark.model(
    "ResNet18, MobileNet_V2, GoogLeNet, DenseNet121, Inception_V3, Bert_128, Roberta_128, DistilBert_128, All-MPNet_128, ASR"
)
@pytest.mark.parametrize("ec2_instance_type", PT_EC2_CPU_INSTANCE_TYPE, indirect=True)
@pytest.mark.team("conda")
def test_performance_ec2_pytorch_inference_cpu(
    pytorch_inference, ec2_connection, region, cpu_only, ec2_instance_type
):
    """Run the synthetic CPU inference benchmark against a PyTorch inference image
    and validate results against the per-framework-version CPU thresholds."""
    _, framework_version = get_framework_and_version_from_tag(pytorch_inference)
    ec2_performance_pytorch_inference(
        pytorch_inference,
        "cpu",
        ec2_instance_type,
        ec2_connection,
        region,
        PT_PERFORMANCE_INFERENCE_CPU_CMD,
        get_threshold_for_image(framework_version, PYTORCH_INFERENCE_CPU_THRESHOLD),
    )
87+
88+
@pytest.mark.model(
    "VGG13, MobileNet_V2, GoogLeNet, DenseNet121, Inception_V3, ResNet18, ResNet50, ViT_B_16, Bert_128, Bert_256, Roberta_128, Roberta_256, DistilBert_128, DistilBert_256, All-MPNet_128, All-MPNet_256, ASR"
)
@pytest.mark.parametrize("ec2_instance_type", PT_EC2_GPU_ARM64_INSTANCE_TYPE, indirect=True)
@pytest.mark.parametrize("ec2_instance_ami", [UL22_BASE_ARM64_DLAMI_US_WEST_2], indirect=True)
@pytest.mark.team("conda")
def test_performance_ec2_pytorch_inference_arm64_gpu(
    pytorch_inference_arm64, ec2_connection, region, gpu_only, ec2_instance_type
):
    """Run the synthetic GPU inference benchmark against an arm64 PyTorch
    inference image on a Graviton GPU instance (g5g) and validate against the
    GPU thresholds.

    Fix: the arm64 skip guard now runs first, so non-arm64 images are skipped
    before tag parsing and threshold lookup are performed.
    """
    if "arm64" not in pytorch_inference_arm64:
        pytest.skip("skip benchmark tests for non-arm64 images")
    _, framework_version = get_framework_and_version_from_tag(pytorch_inference_arm64)
    threshold = get_threshold_for_image(framework_version, PYTORCH_INFERENCE_GPU_THRESHOLD)
    ec2_performance_pytorch_inference(
        pytorch_inference_arm64,
        "gpu",
        ec2_instance_type,
        ec2_connection,
        region,
        PT_PERFORMANCE_INFERENCE_GPU_CMD,
        threshold,
    )
111+
112+
@pytest.mark.model(
    "ResNet18, MobileNet_V2, GoogLeNet, DenseNet121, Inception_V3, Bert_128, Roberta_128, DistilBert_128, All-MPNet_128, ASR"
)
@pytest.mark.parametrize("ec2_instance_type", PT_EC2_CPU_ARM64_INSTANCE_TYPES, indirect=True)
@pytest.mark.parametrize("ec2_instance_ami", [UL22_BASE_ARM64_DLAMI_US_WEST_2], indirect=True)
@pytest.mark.team("conda")
def test_performance_ec2_pytorch_inference_arm64_cpu(
    pytorch_inference_arm64, ec2_connection, region, cpu_only, ec2_instance_type
):
    """Run the synthetic CPU inference benchmark against an arm64 PyTorch
    inference image on Graviton CPU instances and validate against the CPU
    thresholds.

    Fix: the arm64 skip guard now runs first, so non-arm64 images are skipped
    before tag parsing and threshold lookup are performed.
    """
    if "arm64" not in pytorch_inference_arm64:
        pytest.skip("skip benchmark tests for non-arm64 images")
    _, framework_version = get_framework_and_version_from_tag(pytorch_inference_arm64)
    threshold = get_threshold_for_image(framework_version, PYTORCH_INFERENCE_CPU_THRESHOLD)
    ec2_performance_pytorch_inference(
        pytorch_inference_arm64,
        "cpu",
        ec2_instance_type,
        ec2_connection,
        region,
        PT_PERFORMANCE_INFERENCE_CPU_CMD,
        threshold,
    )
@@ -60,7 +136,7 @@ def test_performance_ec2_pytorch_inference_cpu(pytorch_inference, ec2_connection
60136
61137
def ec2_performance_pytorch_inference(
    image_uri, processor, ec2_instance_type, ec2_connection, region, test_cmd, threshold
):
    """Execute the PyTorch inference benchmark for *image_uri* on a remote EC2 host.

    Starts the DLC container, runs *test_cmd* inside it, tees the output to a
    timestamped log file, then uploads the log to S3 and validates it against
    *threshold*.

    :param image_uri: ECR URI of the DLC image under test
    :param processor: "cpu" or "gpu"; "gpu" adds the nvidia docker runtime flags
    :param ec2_instance_type: instance type, forwarded to the benchmark script
        (``--instance``) and to the S3 upload/validation step
    :param ec2_connection: remote connection whose ``run`` executes shell commands
        on the EC2 host
    :param region: AWS region used by the S3 upload/validation step
    :param test_cmd: benchmark command line to run inside the container
    :param threshold: threshold table validated by post-processing
    """
    docker_runtime = "--runtime=nvidia --gpus all" if processor == "gpu" else ""
    container_test_local_dir = os.path.join("$HOME", "container_tests")

    # NOTE(review): the following ECR-login / pull / tag-parsing lines were
    # unchanged context hidden by the diff hunk — confirm against the full file.
    account_id = get_account_id_from_image_uri(image_uri)
    login_to_ecr_registry(ec2_connection, account_id, region)
    ec2_connection.run(f"docker pull {image_uri}")
    repo_name, image_tag = image_uri.split("/")[-1].split(":")
    time_str = time.strftime("%Y-%m-%d-%H-%M-%S")
    commit_info = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION")

    # Run performance inference command, display benchmark results to console
    container_name = f"{repo_name}-performance-{image_tag}-ec2"
    log_file = f"synthetic_{commit_info}_{time_str}.log"

    try:
        ec2_connection.run(
            f"docker run {docker_runtime} -d --name {container_name} -e OMP_NUM_THREADS=1 "
            f"-v {container_test_local_dir}:{os.path.join(os.sep, 'test')} {image_uri}"
        )
    except Exception as e:
        # Fix: failure paths now log at ERROR level (previously LOGGER.info).
        LOGGER.error(f"Failed to start container: {e}")
        # NOTE(review): returning here silently skips the benchmark AND the S3
        # validation below, so the test passes without measuring anything —
        # consider re-raising instead.
        return

    try:
        # Benchmark models (Bert/Roberta/DistilBert/MPNet) need transformers;
        # warn=True tolerates a non-zero exit (e.g. already installed).
        ec2_connection.run(f"docker exec {container_name} pip install transformers", warn=True)
    except Exception as e:
        LOGGER.error(f"Failed to install transformers: {e}")
        # NOTE(review): same silent-skip concern as above.
        return

    try:
        LOGGER.info(f"Starting benchmark test on {processor} {ec2_instance_type} instance...")
        result = ec2_connection.run(
            f"docker exec {container_name} python {test_cmd} --instance {ec2_instance_type} 2>&1 | tee {log_file}",
            timeout=3600,
            warn=True,
        )

        # warn=True suppresses the exception on a non-zero exit, so surface
        # the failure explicitly here.
        if result.failed:
            LOGGER.error(f"Command failed with exit code {result.return_code}")
            LOGGER.error(f"Error output:\n{result.stderr}")
        else:
            LOGGER.info("Command completed successfully")
            sys.stdout.flush()

    except Exception as e:
        # Fix: log with traceback (previously LOGGER.info without stack info).
        LOGGER.exception(f"An error occurred during test execution: {e}")

    finally:
        # Always remove the container, even when the benchmark failed.
        LOGGER.info(f"Cleaning {processor} {ec2_instance_type} up...")
        ec2_connection.run(f"docker rm -f {container_name}")

    # NOTE(review): the two arguments between image_uri and threshold were
    # unchanged context hidden by the diff hunk — confirm against the full file.
    ec2_performance_upload_result_to_s3_and_validate(
        ec2_connection,
        image_uri,
        log_file,
        "synthetic",
        threshold,
        post_process_inference,
        log_file,
        ec2_instance_type,
    )
0 commit comments