Skip to content

Commit fc1d89a

Browse files
Yadan-WeiYadan Wei
andauthored
Benchmark ARM64 PyTorch Inference (#4620)
* modify benchmark test * fix format * no build * comment out UL20_PT_NEURON_US_WEST_2 * test connection * fix connection error * use pkey * stash * test connection again * print version * fix username * run cpu tests too * run cpu tests add log * run cpu tests add log * add error catch * remove VGG13 model for cpu * remove VGG13 model for cpu * use a bigger c6g * remove c6g * stash * add more models and threshold * fix command * print log * add timeout * fix logger * distilbert * check distillbert key and decrease timeout * add strace and adjust bert model order * remove strace * remove strace * add instance type and adjust threshold * fix bug * add more instance and print instance in result * split model to cpu and gpu * use end time - start time * change unit and remove some heavy models * change units * build 2.6 and test benchmark * build PT2.5 x86 test * revert toml * revert buildspec * revert buildspec * change instance type --------- Co-authored-by: Yadan Wei <yadanwei@amazon.com>
1 parent 1d3d383 commit fc1d89a

File tree

6 files changed

+289
-96
lines changed

6 files changed

+289
-96
lines changed

src/benchmark_metrics.py

Lines changed: 27 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -79,22 +79,37 @@
7979
# p99 latency, unit: millisecond
# Per-model p99 latency ceilings used to validate benchmark runs; a run fails
# when the measured p99 latency exceeds the model's entry.  Keys of the outer
# dict are framework-version specifiers; keys of the inner dict are model names
# as reported by the benchmark script (suffix _128/_256 = input sequence length).
PYTORCH_INFERENCE_CPU_THRESHOLD = {
    ">=1.0": {
        "ResNet18": 80.0,
        "MobileNet_V2": 60.0,
        "GoogLeNet": 120.0,
        "DenseNet121": 200.0,
        "Inception_V3": 250.0,
        "DistilBert_128": 200.0,
        "Bert_128": 300.0,
        "Roberta_128": 300.0,
        "ASR": 300.0,
        "All-MPNet_128": 300.0,
    }
}
# GPU thresholds cover a larger model set (heavier models are feasible on GPU).
PYTORCH_INFERENCE_GPU_THRESHOLD = {
    ">=1.0": {
        "ResNet18": 7.5,
        "VGG13": 4.0,
        "MobileNet_V2": 13.0,
        "GoogLeNet": 18.0,
        "DenseNet121": 40.0,
        "Inception_V3": 30.0,
        "ResNet50": 15.0,
        "ViT_B_16": 20.0,
        "DistilBert_128": 10.0,
        "DistilBert_256": 11.0,
        "Bert_128": 20.0,
        "Bert_256": 20.0,
        "Roberta_128": 20.0,
        "Roberta_256": 20.0,
        "ASR": 20.0,
        "All-MPNet_128": 20.0,
        "All-MPNet_256": 30.0,
    }
}
100115

Lines changed: 130 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
import os
22
import time
33
import pytest
4+
import logging
5+
import sys
46

57
from src.benchmark_metrics import (
68
PYTORCH_INFERENCE_GPU_THRESHOLD,
@@ -10,48 +12,122 @@
1012
from test.test_utils import (
1113
CONTAINER_TESTS_PREFIX,
1214
get_framework_and_version_from_tag,
13-
UL20_CPU_ARM64_US_WEST_2,
15+
UL22_BASE_ARM64_DLAMI_US_WEST_2,
1416
login_to_ecr_registry,
1517
get_account_id_from_image_uri,
1618
LOGGER,
1719
)
1820
from test.test_utils.ec2 import (
1921
ec2_performance_upload_result_to_s3_and_validate,
2022
post_process_inference,
23+
get_ec2_instance_type,
2124
)
2225

# Module-level logger for benchmark progress and diagnostics; emits to stderr
# so output is visible in the pytest / CodeBuild console.
# NOTE(review): this rebinding shadows the LOGGER imported from test.test_utils
# above — confirm that is intentional.
LOGGER = logging.getLogger(__name__)
LOGGER.setLevel(logging.INFO)
# Guard against attaching a second StreamHandler if this module is imported
# more than once; duplicate handlers would double every log line.
if not LOGGER.handlers:
    LOGGER.addHandler(logging.StreamHandler(sys.stderr))
30+
31+
2332
# Benchmark driver script executed inside the DLC container under test.
PT_PERFORMANCE_INFERENCE_SCRIPT = os.path.join(
    CONTAINER_TESTS_PREFIX, "benchmark", "run_pytorch_inference_performance.py"
)
# Command lines passed to `docker exec ... python`; the CPU run uses fewer
# iterations than the GPU run.  (Trailing space on the CPU command is preserved
# from the original.)
PT_PERFORMANCE_INFERENCE_CPU_CMD = f"{PT_PERFORMANCE_INFERENCE_SCRIPT} --iterations 500 "
PT_PERFORMANCE_INFERENCE_GPU_CMD = f"{PT_PERFORMANCE_INFERENCE_SCRIPT} --iterations 1000 --gpu"
# Use the original p3.16xlarge instance; consider whether a single-GPU instance
# such as g4dn.4xlarge or g5.4xlarge would suffice.
PT_EC2_GPU_INSTANCE_TYPE = ["p3.16xlarge"]
PT_EC2_CPU_INSTANCE_TYPE = get_ec2_instance_type(default="c5.18xlarge", processor="cpu")
# c6g.4xlarge and c6g.8xlarge reach 100% CPU usage when the benchmark runs the
# VGG13 model, so larger Graviton generations are used instead.
PT_EC2_CPU_ARM64_INSTANCE_TYPES = ["c7g.4xlarge", "c8g.4xlarge", "m7g.4xlarge", "r8g.4xlarge"]
PT_EC2_GPU_ARM64_INSTANCE_TYPE = get_ec2_instance_type(
    default="g5g.4xlarge", processor="gpu", arch_type="arm64"
)
2845

2946

@pytest.mark.model(
    "VGG13, MobileNet_V2, GoogLeNet, DenseNet121, Inception_V3, ResNet18, ResNet50, ViT_B_16, Bert_128, Bert_256, Roberta_128, Roberta_256, DistilBert_128, DistilBert_256, All-MPNet_128, All-MPNet_256, ASR"
)
@pytest.mark.parametrize("ec2_instance_type", PT_EC2_GPU_INSTANCE_TYPE, indirect=True)
@pytest.mark.team("conda")
def test_performance_ec2_pytorch_inference_gpu(
    pytorch_inference, ec2_connection, region, gpu_only, ec2_instance_type
):
    """Run the synthetic PyTorch inference benchmark on a GPU EC2 instance and
    validate measured latencies against PYTORCH_INFERENCE_GPU_THRESHOLD."""
    _, fw_version = get_framework_and_version_from_tag(pytorch_inference)
    latency_thresholds = get_threshold_for_image(fw_version, PYTORCH_INFERENCE_GPU_THRESHOLD)
    ec2_performance_pytorch_inference(
        pytorch_inference,
        "gpu",
        ec2_instance_type,
        ec2_connection,
        region,
        PT_PERFORMANCE_INFERENCE_GPU_CMD,
        latency_thresholds,
    )
4466

4567

@pytest.mark.model(
    "ResNet18, MobileNet_V2, GoogLeNet, DenseNet121, Inception_V3, Bert_128, Roberta_128, DistilBert_128, All-MPNet_128, ASR"
)
@pytest.mark.parametrize("ec2_instance_type", PT_EC2_CPU_INSTANCE_TYPE, indirect=True)
@pytest.mark.team("conda")
def test_performance_ec2_pytorch_inference_cpu(
    pytorch_inference, ec2_connection, region, cpu_only, ec2_instance_type
):
    """Run the synthetic PyTorch inference benchmark on a CPU EC2 instance and
    validate measured latencies against PYTORCH_INFERENCE_CPU_THRESHOLD."""
    _, fw_version = get_framework_and_version_from_tag(pytorch_inference)
    latency_thresholds = get_threshold_for_image(fw_version, PYTORCH_INFERENCE_CPU_THRESHOLD)
    ec2_performance_pytorch_inference(
        pytorch_inference,
        "cpu",
        ec2_instance_type,
        ec2_connection,
        region,
        PT_PERFORMANCE_INFERENCE_CPU_CMD,
        latency_thresholds,
    )
87+
88+
89+
@pytest.mark.model(
    "VGG13, MobileNet_V2, GoogLeNet, DenseNet121, Inception_V3, ResNet18, ResNet50, ViT_B_16, Bert_128, Bert_256, Roberta_128, Roberta_256, DistilBert_128, DistilBert_256, All-MPNet_128, All-MPNet_256, ASR"
)
@pytest.mark.parametrize("ec2_instance_type", PT_EC2_GPU_ARM64_INSTANCE_TYPE, indirect=True)
@pytest.mark.parametrize("ec2_instance_ami", [UL22_BASE_ARM64_DLAMI_US_WEST_2], indirect=True)
@pytest.mark.team("conda")
def test_performance_ec2_pytorch_inference_arm64_gpu(
    pytorch_inference_arm64, ec2_connection, region, gpu_only, ec2_instance_type
):
    """Run the synthetic PyTorch inference benchmark on an arm64 GPU EC2 instance
    and validate measured latencies against PYTORCH_INFERENCE_GPU_THRESHOLD.

    Skips when the image under test is not an arm64 image.
    """
    # Guard clause first: skip before parsing the tag or looking up thresholds,
    # so non-arm64 images do no unnecessary work (original checked after).
    if "arm64" not in pytorch_inference_arm64:
        pytest.skip("skip benchmark tests for non-arm64 images")
    _, framework_version = get_framework_and_version_from_tag(pytorch_inference_arm64)
    threshold = get_threshold_for_image(framework_version, PYTORCH_INFERENCE_GPU_THRESHOLD)
    ec2_performance_pytorch_inference(
        pytorch_inference_arm64,
        "gpu",
        ec2_instance_type,
        ec2_connection,
        region,
        PT_PERFORMANCE_INFERENCE_GPU_CMD,
        threshold,
    )
111+
112+
113+
@pytest.mark.model(
114+
"ResNet18, MobileNet_V2, GoogLeNet, DenseNet121, Inception_V3, Bert_128, Roberta_128, DistilBert_128, All-MPNet_128, ASR"
115+
)
116+
@pytest.mark.parametrize("ec2_instance_type", PT_EC2_CPU_ARM64_INSTANCE_TYPES, indirect=True)
117+
@pytest.mark.parametrize("ec2_instance_ami", [UL22_BASE_ARM64_DLAMI_US_WEST_2], indirect=True)
118+
@pytest.mark.team("conda")
119+
def test_performance_ec2_pytorch_inference_arm64_cpu(
120+
pytorch_inference_arm64, ec2_connection, region, cpu_only, ec2_instance_type
121+
):
122+
_, framework_version = get_framework_and_version_from_tag(pytorch_inference_arm64)
123+
threshold = get_threshold_for_image(framework_version, PYTORCH_INFERENCE_CPU_THRESHOLD)
124+
if "arm64" not in pytorch_inference_arm64:
125+
pytest.skip("skip benchmark tests for non-arm64 images")
126+
127+
ec2_performance_pytorch_inference(
128+
pytorch_inference_arm64,
129+
"cpu",
130+
ec2_instance_type,
55131
ec2_connection,
56132
region,
57133
PT_PERFORMANCE_INFERENCE_CPU_CMD,
@@ -60,7 +136,7 @@ def test_performance_ec2_pytorch_inference_cpu(pytorch_inference, ec2_connection
60136

61137

62138
def ec2_performance_pytorch_inference(
63-
image_uri, processor, ec2_connection, region, test_cmd, threshold
139+
image_uri, processor, ec2_instance_type, ec2_connection, region, test_cmd, threshold
64140
):
65141
docker_runtime = "--runtime=nvidia --gpus all" if processor == "gpu" else ""
66142
container_test_local_dir = os.path.join("$HOME", "container_tests")
@@ -75,16 +151,52 @@ def ec2_performance_pytorch_inference(
75151
time_str = time.strftime("%Y-%m-%d-%H-%M-%S")
76152
commit_info = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION")
77153
# Run performance inference command, display benchmark results to console
154+
78155
container_name = f"{repo_name}-performance-{image_tag}-ec2"
79156
log_file = f"synthetic_{commit_info}_{time_str}.log"
80-
ec2_connection.run(
81-
f"docker run {docker_runtime} -d --name {container_name} -e OMP_NUM_THREADS=1 "
82-
f"-v {container_test_local_dir}:{os.path.join(os.sep, 'test')} {image_uri} "
83-
)
84-
ec2_connection.run(
85-
f"docker exec {container_name} " f"python {test_cmd} " f"2>&1 | tee {log_file}"
86-
)
87-
ec2_connection.run(f"docker rm -f {container_name}")
157+
158+
try:
159+
ec2_connection.run(
160+
f"docker run {docker_runtime} -d --name {container_name} -e OMP_NUM_THREADS=1 "
161+
f"-v {container_test_local_dir}:{os.path.join(os.sep, 'test')} {image_uri}"
162+
)
163+
164+
except Exception as e:
165+
LOGGER.info(f"Failed to start container: {e}")
166+
return
167+
168+
try:
169+
ec2_connection.run(f"docker exec {container_name} pip install transformers", warn=True)
170+
171+
except Exception as e:
172+
LOGGER.info(f"Failed to install transformers: {e}")
173+
return
174+
175+
try:
176+
LOGGER.info(f"Starting benchmark test on {processor} {ec2_instance_type} instance...")
177+
result = ec2_connection.run(
178+
f"docker exec {container_name} python {test_cmd} --instance {ec2_instance_type} 2>&1 | tee {log_file}",
179+
timeout=3600,
180+
warn=True,
181+
)
182+
183+
# Check if the command was successful
184+
if result.failed:
185+
LOGGER.info(f"Command failed with exit code {result.return_code}")
186+
LOGGER.info(f"Error output:\n{result.stderr}")
187+
else:
188+
LOGGER.info("Command completed successfully")
189+
sys.stdout.flush()
190+
191+
except Exception as e:
192+
LOGGER.info(f"An error occurred during test execution: {e}")
193+
194+
finally:
195+
# This block will run regardless of whether an exception occurred
196+
LOGGER.info(f"Cleaning {processor} {ec2_instance_type} up...")
197+
198+
ec2_connection.run(f"docker rm -f {container_name}")
199+
88200
ec2_performance_upload_result_to_s3_and_validate(
89201
ec2_connection,
90202
image_uri,
@@ -93,4 +205,5 @@ def ec2_performance_pytorch_inference(
93205
threshold,
94206
post_process_inference,
95207
log_file,
208+
ec2_instance_type,
96209
)

test/dlc_tests/conftest.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -779,6 +779,7 @@ def ec2_connection(request, ec2_instance, ec2_key_name, ec2_instance_type, regio
779779
user = ec2_utils.get_instance_user(instance_id, region=region)
780780

781781
LOGGER.info(f"Connecting to {user}@{ip_address}")
782+
782783
conn = Connection(
783784
user=user,
784785
host=ip_address,

0 commit comments

Comments
 (0)