Skip to content

Commit 568e6e2

Browse files
committed
add skipdict
1 parent c200c01 commit 568e6e2

File tree

2 files changed

+137
-3
lines changed

2 files changed

+137
-3
lines changed

test/dlc_tests/conftest.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@
5353
# ECR repo name fixtures
5454
# PyTorch
5555
"pytorch_training",
56+
"pytorch_training___2__6",
5657
"pytorch_training___2__5",
5758
"pytorch_training___2__4",
5859
"pytorch_training___2__3",
@@ -905,7 +906,7 @@ def skip_torchdata_test(request):
905906
if not image_uri:
906907
return
907908

908-
skip_dict = {">2.1.1": ["cpu", "cu118", "cu121"], ">=2.4": ["cpu", "cu124"]}
909+
skip_dict = {">2.1.1": ["cpu", "cu118", "cu121"], ">=2.4,<2.6": ["cpu", "cu124"]}
909910
if _validate_pytorch_framework_version(request, image_uri, "skip_torchdata_test", skip_dict):
910911
pytest.skip(
911912
f"Torchdata has paused development as of July 2023 and the latest compatible PyTorch version is 2.1.1."
@@ -924,7 +925,7 @@ def skip_smdebug_v1_test(request):
924925
else:
925926
return
926927

927-
skip_dict = {"==2.0.*": ["cu121"], ">=2.1,<2.4": ["cpu", "cu121"], ">=2.4": ["cpu", "cu124"]}
928+
skip_dict = {"==2.0.*": ["cu121"], ">=2.1,<2.4": ["cpu", "cu121"], ">=2.4": ["cpu", "cu124"], ">=2.6": ["cpu", "cu126"]}
928929
if _validate_pytorch_framework_version(request, image_uri, "skip_smdebug_v1_test", skip_dict):
929930
pytest.skip(f"SM Profiler v1 is on path for deprecation, skipping test")
930931

@@ -942,7 +943,7 @@ def skip_dgl_test(request):
942943
else:
943944
return
944945

945-
skip_dict = {"==2.0.*": ["cu121"], ">=2.1,<2.4": ["cpu", "cu121"], ">=2.4": ["cpu", "cu124"]}
946+
skip_dict = {"==2.0.*": ["cu121"], ">=2.1,<2.4": ["cpu", "cu121"], ">=2.4": ["cpu", "cu124"], ">=2.6": ["cpu", "cu126"]}
946947
if _validate_pytorch_framework_version(request, image_uri, "skip_dgl_test", skip_dict):
947948
pytest.skip(f"DGL binaries are removed, skipping test")
948949

@@ -1005,6 +1006,7 @@ def skip_serialized_release_pt_test(request):
10051006
"==1.13.*": ["cpu", "cu117"],
10061007
">=2.1,<2.4": ["cpu", "cu121"],
10071008
">=2.4,<2.6": ["cpu", "cu124"],
1009+
">=2.6,<2.7": ["cpu", "cu126"]
10081010
}
10091011
if _validate_pytorch_framework_version(
10101012
request, image_uri, "skip_serialized_release_pt_test", skip_dict
Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
import pytest
2+
3+
import test.test_utils as test_utils
4+
5+
from test.test_utils import ec2
6+
7+
from test.dlc_tests.ec2.pytorch.training import common_cases
8+
from test.dlc_tests.ec2 import smclarify_cases
9+
10+
11+
@pytest.mark.usefixtures("sagemaker")
@pytest.mark.integration("pytorch_gpu_tests")
@pytest.mark.model("N/A")
@pytest.mark.team("conda")
@pytest.mark.parametrize(
    "ec2_instance_type, region", common_cases.PT_EC2_GPU_INSTANCE_TYPE_AND_REGION, indirect=True
)
def test_pytorch_2_6_gpu(
    pytorch_training___2__6, ec2_connection, region, gpu_only, ec2_instance_type
):
    """Run the PT 2.6 GPU training cases one after another over ``ec2_connection``.

    The test is skipped when ``test_utils`` reports the image as incompatible
    with the parametrized instance type. The smclarify case is added only for
    image URIs containing "sagemaker", and the AMP case only on multi-GPU
    instance types.

    ``gpu_only`` is requested purely as a fixture; it is not used in the body
    (presumably it filters out non-GPU images -- confirm in conftest).
    """
    pytorch_training = pytorch_training___2__6
    incompatible = test_utils.is_image_incompatible_with_instance_type(
        pytorch_training, ec2_instance_type
    )
    if incompatible:
        pytest.skip(
            f"Image {pytorch_training} is incompatible with instance type {ec2_instance_type}"
        )

    # Most cases take the same (image, connection) argument pair.
    shared_args = (pytorch_training, ec2_connection)
    uniform_cases = (
        common_cases.pytorch_standalone,
        common_cases.pytorch_training_mnist,
        common_cases.pytorch_linear_regression_gpu,
        common_cases.pytorch_gloo,
        common_cases.pytorch_nccl,
        common_cases.pytorch_mpi,
        common_cases.pytorch_training_torchaudio,
    )
    test_cases = [(case, shared_args) for case in uniform_cases]

    # The cuDNN match case is the only one that also needs the region.
    test_cases.append(
        (common_cases.pytorch_cudnn_match_gpu, (pytorch_training, ec2_connection, region))
    )
    test_cases.append((common_cases.pytorch_curand_gpu, shared_args))

    if "sagemaker" in pytorch_training:
        test_cases.append((smclarify_cases.smclarify_metrics_gpu, shared_args))

    # AMP must be run on multi_gpu
    if ec2.is_instance_multi_gpu(ec2_instance_type):
        test_cases.append((common_cases.pytorch_amp, shared_args))

    test_utils.execute_serial_test_cases(test_cases, test_description="PT 2.6 GPU")
49+
50+
51+
@pytest.mark.usefixtures("sagemaker")
@pytest.mark.integration("pytorch_gpu_heavy_tests")
@pytest.mark.model("N/A")
@pytest.mark.team("conda")
@pytest.mark.parametrize(
    "ec2_instance_type, region",
    common_cases.PT_EC2_HEAVY_GPU_INSTANCE_TYPE_AND_REGION,
    indirect=True,
)
@pytest.mark.skipif(
    test_utils.is_pr_context() and not ec2.are_heavy_instance_ec2_tests_enabled(),
    reason="Skip GPU Heavy tests in PR context unless explicitly enabled",
)
def test_pytorch_2_6_gpu_heavy(
    pytorch_training___2__6, ec2_connection, region, gpu_only, ec2_instance_type
):
    """Run the PT 2.6 "GPU Heavy" cases (gdrcopy, transformer engine) serially.

    The ``skipif`` marker disables the test in PR context unless heavy-instance
    EC2 tests are enabled. The test is also skipped at run time when the image
    is reported incompatible with the parametrized instance type.

    ``region`` and ``gpu_only`` are requested as fixtures only; the body does
    not reference them.
    """
    pytorch_training = pytorch_training___2__6
    incompatible = test_utils.is_image_incompatible_with_instance_type(
        pytorch_training, ec2_instance_type
    )
    if incompatible:
        pytest.skip(
            f"Image {pytorch_training} is incompatible with instance type {ec2_instance_type}"
        )

    shared_args = (pytorch_training, ec2_connection)
    heavy_cases = (
        common_cases.pytorch_gdrcopy,
        common_cases.pytorch_transformer_engine,
    )
    test_cases = [(case, shared_args) for case in heavy_cases]

    test_utils.execute_serial_test_cases(test_cases, test_description="PT 2.6 GPU Heavy")
79+
80+
81+
@pytest.mark.usefixtures("sagemaker")
@pytest.mark.integration("inductor")
@pytest.mark.model("N/A")
@pytest.mark.team("training-compiler")
@pytest.mark.parametrize(
    "ec2_instance_type, region",
    common_cases.PT_EC2_GPU_INDUCTOR_INSTANCE_TYPE_AND_REGION,
    indirect=True,
)
def test_pytorch_2_6_gpu_inductor(
    pytorch_training___2__6, ec2_connection, region, gpu_only, ec2_instance_type
):
    """Run the PT 2.6 GPU inductor cases (gloo, mpi, nccl, amp) serially.

    The test is skipped when the image is reported incompatible with the
    parametrized instance type.

    ``region`` and ``gpu_only`` are requested as fixtures only; the body does
    not reference them.
    """
    pytorch_training = pytorch_training___2__6
    incompatible = test_utils.is_image_incompatible_with_instance_type(
        pytorch_training, ec2_instance_type
    )
    if incompatible:
        pytest.skip(
            f"Image {pytorch_training} is incompatible with instance type {ec2_instance_type}"
        )

    shared_args = (pytorch_training, ec2_connection)
    inductor_cases = (
        common_cases.pytorch_gloo_inductor_gpu,
        common_cases.pytorch_mpi_inductor_gpu,
        common_cases.pytorch_nccl_inductor,
        common_cases.pytorch_amp_inductor,
    )
    test_cases = [(case, shared_args) for case in inductor_cases]

    test_utils.execute_serial_test_cases(test_cases, test_description="PT 2.6 GPU Inductor")
107+
108+
109+
@pytest.mark.usefixtures("sagemaker")
@pytest.mark.integration("pytorch_cpu_tests")
@pytest.mark.model("N/A")
@pytest.mark.team("conda")
@pytest.mark.parametrize("ec2_instance_type", common_cases.PT_EC2_CPU_INSTANCE_TYPE, indirect=True)
def test_pytorch_2_6_cpu(pytorch_training___2__6, ec2_connection, cpu_only):
    """Run the PT 2.6 CPU training cases one after another over ``ec2_connection``.

    The smclarify CPU case is added only for image URIs containing "sagemaker".

    ``cpu_only`` is requested purely as a fixture; it is not used in the body
    (presumably it filters out non-CPU images -- confirm in conftest).
    """
    pytorch_training = pytorch_training___2__6

    # Every CPU case takes the same (image, connection) argument pair.
    shared_args = (pytorch_training, ec2_connection)
    cpu_cases = (
        common_cases.pytorch_standalone,
        common_cases.pytorch_training_mnist,
        common_cases.pytorch_linear_regression_cpu,
        common_cases.pytorch_gloo,
        common_cases.pytorch_mpi,
        common_cases.pytorch_training_torchaudio,
        common_cases.pytorch_telemetry_cpu,
    )
    test_cases = [(case, shared_args) for case in cpu_cases]

    if "sagemaker" in pytorch_training:
        test_cases.append((smclarify_cases.smclarify_metrics_cpu, shared_args))

    test_utils.execute_serial_test_cases(test_cases, test_description="PT 2.6 CPU")

0 commit comments

Comments
 (0)