|
import pytest

import test.test_utils as test_utils

from test.test_utils import ec2

from test.dlc_tests.ec2.pytorch.training import common_cases
from test.dlc_tests.ec2 import smclarify_cases
| 9 | + |
| 10 | + |
@pytest.mark.usefixtures("sagemaker")
@pytest.mark.integration("pytorch_gpu_tests")
@pytest.mark.model("N/A")
@pytest.mark.team("conda")
@pytest.mark.parametrize(
    "ec2_instance_type, region", common_cases.PT_EC2_GPU_INSTANCE_TYPE_AND_REGION, indirect=True
)
def test_pytorch_2_6_gpu(
    pytorch_training___2__6, ec2_connection, region, gpu_only, ec2_instance_type
):
    """Run the standard PyTorch 2.6 GPU training cases serially on one EC2 instance."""
    image = pytorch_training___2__6
    if test_utils.is_image_incompatible_with_instance_type(image, ec2_instance_type):
        pytest.skip(
            f"Image {image} is incompatible with instance type {ec2_instance_type}"
        )

    # Most cases share the same (image, connection) arguments.
    args = (image, ec2_connection)
    test_cases = [
        (common_cases.pytorch_standalone, args),
        (common_cases.pytorch_training_mnist, args),
        (common_cases.pytorch_linear_regression_gpu, args),
        (common_cases.pytorch_gloo, args),
        (common_cases.pytorch_nccl, args),
        (common_cases.pytorch_mpi, args),
        (common_cases.pytorch_training_torchaudio, args),
        (common_cases.pytorch_cudnn_match_gpu, (image, ec2_connection, region)),
        (common_cases.pytorch_curand_gpu, args),
    ]

    # SMClarify is only bundled into the sagemaker-flavored images.
    if "sagemaker" in image:
        test_cases += [(smclarify_cases.smclarify_metrics_gpu, args)]

    # AMP must be run on multi_gpu
    if ec2.is_instance_multi_gpu(ec2_instance_type):
        test_cases += [(common_cases.pytorch_amp, args)]

    test_utils.execute_serial_test_cases(test_cases, test_description="PT 2.6 GPU")
| 49 | + |
| 50 | + |
@pytest.mark.usefixtures("sagemaker")
@pytest.mark.integration("pytorch_gpu_heavy_tests")
@pytest.mark.model("N/A")
@pytest.mark.team("conda")
@pytest.mark.parametrize(
    "ec2_instance_type, region",
    common_cases.PT_EC2_HEAVY_GPU_INSTANCE_TYPE_AND_REGION,
    indirect=True,
)
@pytest.mark.skipif(
    test_utils.is_pr_context() and not ec2.are_heavy_instance_ec2_tests_enabled(),
    reason="Skip GPU Heavy tests in PR context unless explicitly enabled",
)
def test_pytorch_2_6_gpu_heavy(
    pytorch_training___2__6, ec2_connection, region, gpu_only, ec2_instance_type
):
    """Run the PyTorch 2.6 cases that require heavy (multi-GPU EFA-class) instances."""
    image = pytorch_training___2__6
    if test_utils.is_image_incompatible_with_instance_type(image, ec2_instance_type):
        pytest.skip(
            f"Image {image} is incompatible with instance type {ec2_instance_type}"
        )

    heavy_cases = [
        (case, (image, ec2_connection))
        for case in (
            common_cases.pytorch_gdrcopy,
            common_cases.pytorch_transformer_engine,
        )
    ]

    test_utils.execute_serial_test_cases(heavy_cases, test_description="PT 2.6 GPU Heavy")
| 79 | + |
| 80 | + |
@pytest.mark.usefixtures("sagemaker")
@pytest.mark.integration("inductor")
@pytest.mark.model("N/A")
@pytest.mark.team("training-compiler")
@pytest.mark.parametrize(
    "ec2_instance_type, region",
    common_cases.PT_EC2_GPU_INDUCTOR_INSTANCE_TYPE_AND_REGION,
    indirect=True,
)
def test_pytorch_2_6_gpu_inductor(
    pytorch_training___2__6, ec2_connection, region, gpu_only, ec2_instance_type
):
    """Run the torch.compile (inductor) distributed/AMP cases for PyTorch 2.6 on GPU."""
    image = pytorch_training___2__6
    if test_utils.is_image_incompatible_with_instance_type(image, ec2_instance_type):
        pytest.skip(
            f"Image {image} is incompatible with instance type {ec2_instance_type}"
        )

    # Order is preserved: cases run serially on the same instance.
    inductor_cases = [
        (case, (image, ec2_connection))
        for case in (
            common_cases.pytorch_gloo_inductor_gpu,
            common_cases.pytorch_mpi_inductor_gpu,
            common_cases.pytorch_nccl_inductor,
            common_cases.pytorch_amp_inductor,
        )
    ]

    test_utils.execute_serial_test_cases(inductor_cases, test_description="PT 2.6 GPU Inductor")
| 107 | + |
| 108 | + |
@pytest.mark.usefixtures("sagemaker")
@pytest.mark.integration("pytorch_cpu_tests")
@pytest.mark.model("N/A")
@pytest.mark.team("conda")
@pytest.mark.parametrize("ec2_instance_type", common_cases.PT_EC2_CPU_INSTANCE_TYPE, indirect=True)
def test_pytorch_2_6_cpu(pytorch_training___2__6, ec2_connection, cpu_only):
    """Run the standard PyTorch 2.6 CPU training cases serially on one EC2 instance."""
    image = pytorch_training___2__6
    args = (image, ec2_connection)

    test_cases = [
        (common_cases.pytorch_standalone, args),
        (common_cases.pytorch_training_mnist, args),
        (common_cases.pytorch_linear_regression_cpu, args),
        (common_cases.pytorch_gloo, args),
        (common_cases.pytorch_mpi, args),
        (common_cases.pytorch_training_torchaudio, args),
        (common_cases.pytorch_telemetry_cpu, args),
    ]

    # SMClarify is only bundled into the sagemaker-flavored images.
    if "sagemaker" in image:
        test_cases.append((smclarify_cases.smclarify_metrics_cpu, args))

    test_utils.execute_serial_test_cases(test_cases, test_description="PT 2.6 CPU")
0 commit comments