|
#!/bin/bash
# Driver script for the Python test suites; tests are launched with
# PJRT_DEVICE=NEURON (see run_test below).
#
# Shell options: abort on the first failing command (-e), trace every command
# (-x), and make a pipeline fail when any of its stages fails (pipefail).
set -e -x -o pipefail

# Physical (symlink-resolved) path of the parent of this script's directory.
# All "$CDIR/..." test paths below are resolved against it.
CDIR="$(cd "$(dirname "$0")/.." ; pwd -P)"

# Defaults; each can be overridden by a command-line flag parsed further down
# (-L clears LOGFILE, -M, -C and -V set the three values below).
LOGFILE=/tmp/pytorch_py_test.log
MAX_GRAPH_SIZE=500
GRAPH_CHECK_FREQUENCY=100
VERBOSITY=2
3 | 8 |
|
4 |
| -python3 test/neuron/test_neuron_utils.py |
# Note [Keep Going]
#
# Set the `CONTINUE_ON_ERROR` flag to `1` (or `true`) to make the CI tests
# continue on error. This will allow you to see all the failures on your PR,
# not stopping with the first test failure like the default behavior.
#
# NOTE(review): with errexit disabled, a failing test no longer aborts the
# script, so the script's exit status is whatever the last command returned.
# Confirm that CI inspects the log rather than the exit code in this mode.
CONTINUE_ON_ERROR="${CONTINUE_ON_ERROR:-0}"
case "$CONTINUE_ON_ERROR" in
  1 | true)
    set +e
    ;;
esac
| 18 | + |
# Command-line flags:
#   -L        clear LOGFILE, so output is not tee'd into a log file
#   -M <n>    MAX_GRAPH_SIZE        (exported below as TRIM_GRAPH_SIZE)
#   -C <n>    GRAPH_CHECK_FREQUENCY (exported below as TRIM_GRAPH_CHECK_FREQUENCY)
#   -V <n>    VERBOSITY             (passed as --verbosity to some tests)
while getopts 'LM:C:V:' OPTION; do
  case "$OPTION" in
    L)
      LOGFILE=
      ;;
    M)
      MAX_GRAPH_SIZE=$OPTARG
      ;;
    C)
      GRAPH_CHECK_FREQUENCY=$OPTARG
      ;;
    V)
      VERBOSITY=$OPTARG
      ;;
    *)
      # getopts has already printed a diagnostic for the bad flag or the
      # missing argument; stop instead of running with a half-parsed command
      # line.
      echo "Usage: $0 [-L] [-M max_graph_size] [-C graph_check_frequency] [-V verbosity]" >&2
      exit 2
      ;;
  esac
done
# Drop the parsed flags so that "$@" holds only the remaining arguments.
shift $((OPTIND - 1))
| 37 | + |
# Environment shared by every test process started from this script.
export TRIM_GRAPH_SIZE=$MAX_GRAPH_SIZE
export TRIM_GRAPH_CHECK_FREQUENCY=$GRAPH_CHECK_FREQUENCY
export TORCH_TEST_DEVICES="$CDIR/pytorch_test_base.py"
export PYTORCH_TEST_WITH_SLOW=1
export XLA_DUMP_FATAL_STACK=1
export CPU_NUM_DEVICES=4

# Directory of the importable torch_xla package; used as the --source for
# `coverage run`. The probe uses python3, the same interpreter name the tests
# are run with. It runs from $HOME -- presumably so that a torch_xla/ directory
# in the current working directory is not picked up instead of the installed
# package (TODO confirm).
TORCH_XLA_DIR=$(cd ~; dirname "$(python3 -c 'import torch_xla; print(torch_xla.__file__)')")
# NOTE(review): COVERAGE_FILE is assigned but neither exported nor referenced
# anywhere in the visible part of this script, so child `coverage` processes
# do not see it. Confirm whether it should be exported.
COVERAGE_FILE="$CDIR/../.coverage"
| 47 | + |
# Run a Python script, optionally under coverage.
# Globals:   USE_COVERAGE (read; unset or "0" means plain python3),
#            TORCH_XLA_DIR (read; coverage --source)
# Arguments: script path followed by its arguments
# Returns:   exit status of the launched process
run_coverage() {
  if [[ "${USE_COVERAGE:-0}" == "0" ]]; then
    python3 "$@"
    return
  fi
  # -p writes one data file per process so parallel runs do not clobber
  # each other.
  coverage run --source="$TORCH_XLA_DIR" -p "$@"
}
| 55 | + |
# Run one test through run_coverage with PJRT_DEVICE=NEURON.
# Arguments: test script path followed by its arguments
# Returns:   exit status of run_coverage
# NOTE(review): this sets NEURON_NUM_DEVICES while other call sites in this
# script set NEURONCORE_NUM_DEVICES -- confirm which name the runtime reads.
run_test() {
  printf '%s\n' "Running in PjRt runtime: $*"
  PJRT_DEVICE=NEURON NEURON_NUM_DEVICES=1 run_coverage "$@"
}
| 60 | + |
# Run one test via run_test with XLA_DISABLE_FUNCTIONALIZATION=1 set for
# that invocation.
# Arguments: test script path followed by its arguments
run_test_without_functionalization() {
  printf '%s\n' "Running with XLA_DISABLE_FUNCTIONALIZATION: $*"
  XLA_DISABLE_FUNCTIONALIZATION=1 run_test "$@"
}
| 65 | + |
# Run one test via run_test with XLA_IR_DEBUG=1 set for that invocation.
# Arguments: test script path followed by its arguments
run_xla_ir_debug() {
  printf '%s\n' "Running with XLA_IR_DEBUG: $*"
  XLA_IR_DEBUG=1 run_test "$@"
}
| 70 | + |
# Run one test via run_test with XLA_USE_BF16=1 set for that invocation.
# Arguments: test script path followed by its arguments
run_use_bf16() {
  printf '%s\n' "Running with XLA_USE_BF16: $*"
  XLA_USE_BF16=1 run_test "$@"
}
| 75 | + |
# Run one test via run_test with XLA_DOWNCAST_BF16=1 set for that invocation.
# Arguments: test script path followed by its arguments
run_downcast_bf16() {
  printf '%s\n' "Running with XLA_DOWNCAST_BF16: $*"
  XLA_DOWNCAST_BF16=1 run_test "$@"
}
| 80 | + |
# Run one test via run_test with XLA_HLO_DEBUG=1 set for that invocation.
# Arguments: test script path followed by its arguments
function run_xla_hlo_debug {
  # The banner names the variable that is actually set; it previously said
  # XLA_IR_DEBUG, which made logs indistinguishable from run_xla_ir_debug.
  echo "Running with XLA_HLO_DEBUG: $*"
  XLA_HLO_DEBUG=1 run_test "$@"
}
| 85 | + |
# Run one test via run_test with XLA_EXPERIMENTAL set to the colon-separated
# list "nonzero:masked_select:masked_scatter:nms" for that invocation.
# Arguments: test script path followed by its arguments
run_dynamic() {
  printf '%s\n' "Running in DynamicShape mode: $*"
  XLA_EXPERIMENTAL="nonzero:masked_select:masked_scatter:nms" run_test "$@"
}
| 90 | + |
# Run one test via run_test with XLA_USE_EAGER_DEBUG_MODE=1 set for that
# invocation.
# Arguments: test script path followed by its arguments
run_eager_debug() {
  printf '%s\n' "Running in Eager Debug mode: $*"
  XLA_USE_EAGER_DEBUG_MODE=1 run_test "$@"
}
| 95 | + |
# Run one test via run_test with XLA_SAVE_TENSORS_FILE pointing at
# /tmp/xla_test_save_ir.txt and XLA_SAVE_TENSORS_FMT=text.
# Arguments: test script path followed by its arguments
run_save_tensor_ir() {
  printf '%s\n' "Running in save tensor file mode: $*"
  XLA_SAVE_TENSORS_FILE="/tmp/xla_test_save_ir.txt" \
    XLA_SAVE_TENSORS_FMT="text" \
    run_test "$@"
}
| 100 | + |
# Run one test via run_test with XLA_SAVE_TENSORS_FILE pointing at
# /tmp/xla_test_save_ir.txt (the same file run_save_tensor_ir uses) and
# XLA_SAVE_TENSORS_FMT=hlo.
# Arguments: test script path followed by its arguments
run_save_tensor_hlo() {
  printf '%s\n' "Running in save tensor file mode: $*"
  XLA_SAVE_TENSORS_FILE="/tmp/xla_test_save_ir.txt" \
    XLA_SAVE_TENSORS_FMT="hlo" \
    run_test "$@"
}
| 105 | + |
# Run one test via run_test with PT_XLA_DEBUG=1 and PT_XLA_DEBUG_FILE set to
# /tmp/pt_xla_debug.txt for that invocation.
# Arguments: test script path followed by its arguments
function run_pt_xla_debug {
  # The banner previously read "save tensor file mode" (copied from the
  # run_save_tensor_* helpers), which mislabelled these runs in the log.
  echo "Running with PT_XLA_DEBUG: $*"
  PT_XLA_DEBUG=1 PT_XLA_DEBUG_FILE="/tmp/pt_xla_debug.txt" run_test "$@"
}
| 110 | + |
# Run one test via run_test with PT_XLA_DEBUG_LEVEL=1 and PT_XLA_DEBUG_FILE
# set to /tmp/pt_xla_debug.txt for that invocation.
# Arguments: test script path followed by its arguments
function run_pt_xla_debug_level1 {
  # The banner previously read "save tensor file mode" (copied from the
  # run_save_tensor_* helpers), which mislabelled these runs in the log.
  echo "Running with PT_XLA_DEBUG_LEVEL=1: $*"
  PT_XLA_DEBUG_LEVEL=1 PT_XLA_DEBUG_FILE="/tmp/pt_xla_debug.txt" run_test "$@"
}
| 115 | + |
# Launch a script with torchrun on a single node with two processes, with
# PJRT_DEVICE=NEURON set for that invocation.
# Arguments: script path followed by its arguments
# Returns:   exit status of torchrun
function run_torchrun {
  # "$@" must be quoted so that paths containing spaces or glob characters
  # reach torchrun as single, unmodified arguments.
  PJRT_DEVICE=NEURON torchrun --nnodes 1 --nproc-per-node 2 "$@"
}
| 119 | + |
# Run the test classes with an XLA suffix from the test/ directory two levels
# above $CDIR.
# Globals:   CDIR (read)
# Arguments: extra arguments, inserted before "-v <TestClass>" on every run
# Note: in this file the only caller is run_op_tests; the call in run_tests
# is commented out.
run_torch_op_tests() {
  local torch_tests="$CDIR/../../test"

  run_dynamic "$torch_tests/test_view_ops.py" "$@" -v TestViewOpsXLA
  run_test_without_functionalization "$torch_tests/test_view_ops.py" "$@" -v TestViewOpsXLA
  run_test "$torch_tests/test_torch.py" "$@" -v TestTorchDeviceTypeXLA
  run_dynamic "$torch_tests/test_torch.py" "$@" -v TestDevicePrecisionXLA
  run_test "$torch_tests/test_torch.py" "$@" -v TestTensorDeviceOpsXLA
  run_test "$torch_tests/test_indexing.py" "$@" -v TestIndexingXLA
  run_test "$torch_tests/test_indexing.py" "$@" -v NumpyTestsXLA
  # run_dynamic "$torch_tests/test_nn.py" "$@" -v TestNNDeviceTypeXLA
  run_dynamic "$torch_tests/nn/test_dropout.py" "$@" -v TestDropoutNNDeviceTypeXLA
  run_dynamic "$torch_tests/nn/test_pooling.py" "$@" -v TestPoolingNNDeviceTypeXLA
  run_dynamic "$torch_tests/nn/test_embedding.py" "$@" -v TestEmbeddingNNDeviceTypeXLA
  run_dynamic "$torch_tests/nn/test_convolution.py" "$@" -v TestConvolutionNNDeviceTypeXLA
  run_dynamic "$torch_tests/nn/test_multihead_attention.py" "$@" -v TestMultiheadAttentionNNDeviceTypeXLA
  run_dynamic "$torch_tests/test_type_promotion.py" "$@" -v TestTypePromotionXLA
}
| 136 | + |
| 137 | +####################################################################################### |
| 138 | +################################# XLA OP TESTS SHARDS ################################# |
| 139 | +####################################################################################### |
| 140 | + |
| 141 | +# DO NOT MODIFY |
# Shard 1 of the XLA op tests. The list is marked DO NOT MODIFY in this file;
# new tests belong in run_xla_op_tests3. Lines that are commented out are
# tests that are currently not run by this script.
# Globals:   CDIR (read); VERBOSITY appears only in commented-out lines
# Arguments: none used by the active lines ("$@" appears only in comments)
function run_xla_op_tests1 {
  #run_dynamic "$CDIR/test_operations.py" "$@" --verbosity=$VERBOSITY
  #run_dynamic "$CDIR/ds/test_dynamic_shapes.py"
  #run_dynamic "$CDIR/ds/test_dynamic_shape_models.py" "$@" --verbosity=$VERBOSITY
  #run_eager_debug "$CDIR/test_operations.py" "$@" --verbosity=$VERBOSITY
  #run_test "$CDIR/test_operations.py" "$@" --verbosity=$VERBOSITY
  #run_test_without_functionalization "$CDIR/test_operations.py" "$@" --verbosity=$VERBOSITY
  # The same debug-tool test is run twice: once with PT_XLA_DEBUG=1 and once
  # with PT_XLA_DEBUG_LEVEL=1.
  run_pt_xla_debug "$CDIR/debug_tool/test_pt_xla_debug.py"
  run_pt_xla_debug_level1 "$CDIR/debug_tool/test_pt_xla_debug.py"
  run_test "$CDIR/test_async_closures.py"
  run_test "$CDIR/test_hlo_metadata.py"
  #run_test "$CDIR/test_profiler.py"
  run_test "$CDIR/pjrt/test_runtime.py"
  #NEURONCORE_NUM_DEVICES=2 python "$CDIR/pjrt/test_ddp.py"
  run_test "$CDIR/pjrt/test_mesh_service.py"
  #run_test "$CDIR/test_python_ops.py"
  #run_test "$CDIR/test_ops.py"
  run_test "$CDIR/test_metrics.py"
  run_test "$CDIR/test_deprecation.py"
  run_test "$CDIR/dynamo/test_dynamo_integrations_util.py"
  #run_test "$CDIR/dynamo/test_dynamo_aliasing.py"
  run_test "$CDIR/dynamo/test_dynamo.py"
  run_test "$CDIR/dynamo/test_dynamo_dynamic_shape.py"
  run_test "$CDIR/dynamo/test_bridge.py"
  run_test "$CDIR/dynamo/test_num_output.py"
  run_test "$CDIR/dynamo/test_graph_input_matcher.py"
  run_test "$CDIR/dynamo/test_dynamo_config.py"
  run_save_tensor_ir "$CDIR/dynamo/test_dynamo_graph_dump.py"
  # test_data_type.py is run only in the two bf16 modes; the plain run is
  # commented out.
  #run_test "$CDIR/test_data_type.py"
  run_use_bf16 "$CDIR/test_data_type.py"
  run_downcast_bf16 "$CDIR/test_data_type.py"
  #run_test "$CDIR/test_fp8.py"
  run_xla_ir_debug "$CDIR/test_env_var_mapper.py"
  run_xla_hlo_debug "$CDIR/test_env_var_mapper.py"
  run_xla_hlo_debug "$CDIR/stablehlo/test_stablehlo_save_load.py"
  # Both graph-dump runs below write to the same /tmp/xla_test_save_ir.txt
  # file (see run_save_tensor_ir / run_save_tensor_hlo).
  run_save_tensor_ir "$CDIR/spmd/test_spmd_graph_dump.py"
  run_save_tensor_hlo "$CDIR/spmd/test_spmd_graph_dump.py"
}
| 180 | + |
# Shard 2 of the XLA op tests: runs each listed file through run_test, in
# order, then test_callback.py a second time with XLA_USE_SPMD=1.
# Globals:   CDIR (read)
# Arguments: none
run_xla_op_tests2() {
  local rel_path
  local -a shard_tests=(
    pjrt/test_dtypes.py
    #test_while_loop.py
    test_scan.py
    test_autocast.py
    test_grad_checkpoint.py
    #eager/test_eager.py
    eager/test_eager_with_xla_compile.py
    eager/test_eager_with_torch_compile.py
    #eager/test_eager_all_reduce_in_place.py
    eager/test_eager_spmd.py
    test_callback.py
  )

  for rel_path in "${shard_tests[@]}"; do
    run_test "$CDIR/$rel_path"
  done
  XLA_USE_SPMD=1 run_test "$CDIR/test_callback.py"
}
| 195 | + |
| 196 | +# All the new xla op tests should go to run_xla_op_tests3 |
# Shard 3 of the XLA op tests; new tests are added here.
# Globals:   CDIR (read), VERBOSITY (read)
# Arguments: extra arguments, forwarded only to test_operations_hlo.py
# Each test file is listed once: test_xla_spmd_python_api_interaction.py and
# test_xla_auto_sharding.py used to appear twice and therefore ran twice.
function run_xla_op_tests3 {
  # Repository root; $CDIR is the test/ directory (tests are addressed as
  # "$CDIR/neuron/...", "$CDIR/spmd/...").
  local repo_root="$CDIR/.."

  # TODO(qihqi): this test require tensorflow to run. need to setup separate
  # CI with tf.
  run_test "$CDIR/stablehlo/test_exports.py"
  run_test "$CDIR/stablehlo/test_export_fx_passes.py"
  run_test "$CDIR/stablehlo/test_implicit_broadcasting.py"
  run_test "$CDIR/stablehlo/test_composite.py"
  run_test "$CDIR/stablehlo/test_pt2e_qdq.py"
  run_test "$CDIR/stablehlo/test_stablehlo_custom_call.py"
  #run_xla_hlo_debug "$CDIR/stablehlo/test_stablehlo_inference.py"
  #=run_test "$CDIR/stablehlo/test_stablehlo_compile.py"
  run_test "$CDIR/stablehlo/test_unbounded_dynamism.py"
  #run_test "$CDIR/quantized_ops/test_quantized_matmul.py"
  #run_test "$CDIR/quantized_ops/test_dot_general.py"
  #run_test "$CDIR/spmd/test_xla_sharding.py"
  run_test "$CDIR/spmd/test_xla_sharding_hlo.py"
  #run_test "$CDIR/spmd/test_xla_virtual_device.py"
  #run_test "$CDIR/spmd/test_dynamo_spmd.py"
  run_test "$CDIR/spmd/test_spmd_debugging.py"
  #=run_test "$CDIR/spmd/test_xla_distributed_checkpoint.py"
  run_test "$CDIR/spmd/test_xla_spmd_python_api_interaction.py"
  #run_test "$CDIR/spmd/test_dtensor_integration.py"
  #run_test "$CDIR/spmd/test_dtensor_integration2.py"
  run_test "$CDIR/spmd/test_xla_auto_sharding.py"
  #run_test "$CDIR/spmd/test_spmd_parameter_wrapping.py"
  run_test "$CDIR/spmd/test_train_spmd_linear_model.py"
  run_test "$CDIR/spmd/test_fsdp_v2.py"
  run_test "$CDIR/test_operations_hlo.py" "$@" "--verbosity=$VERBOSITY"
  run_test "$CDIR/test_input_output_aliases.py"
  run_test "$CDIR/test_torch_distributed_xla_backend.py"
  run_torchrun "$CDIR/pjrt/test_torchrun.py"
  run_test "$CDIR/test_persistent_cache.py"
  run_test "$CDIR/test_devices.py"

  run_test "$CDIR/neuron/test_neuron_utils.py"

  # Example scripts are addressed through $repo_root so that they are found
  # regardless of the directory the script is started from.
  #python3 "$repo_root/examples/data_parallel/train_resnet_xla_ddp.py" # compiler error
  #python3 "$repo_root/examples/fsdp/train_resnet_fsdp_auto_wrap.py"
  #python3 "$repo_root/examples/eager/train_decoder_only_eager.py" # OOM
  #python3 "$repo_root/examples/eager/train_decoder_only_eager_spmd_data_parallel.py" # compiler err due to f64
  PJRT_DEVICE=NEURON NEURONCORE_NUM_DEVICES=1 python3 "$repo_root/examples/eager/train_decoder_only_eager_with_compile.py" # nan loss expected?
  PJRT_DEVICE=NEURON NEURONCORE_NUM_DEVICES=1 python3 "$repo_root/examples/eager/train_decoder_only_eager_multi_process.py"
}
| 242 | + |
| 243 | +####################################################################################### |
| 244 | + |
# Run the torch op tests followed by the three XLA op test shards, in order.
# Arguments: none (the shards are invoked without arguments)
# Returns:   exit status of the last shard
run_op_tests() {
  local shard
  for shard in run_torch_op_tests run_xla_op_tests1 run_xla_op_tests2 run_xla_op_tests3; do
    "$shard"
  done
}
| 251 | + |
# Run the multi-process tests: first through run_test (which sets
# NEURON_NUM_DEVICES=1), then the torch_distributed tests again directly
# under python3 with NEURONCORE_NUM_DEVICES=2.
# Globals:   CDIR (read)
# Arguments: none
function run_mp_op_tests {
  local dist_dir="$CDIR/torch_distributed"
  local two_core_test

  run_test "$CDIR/test_mp_replication.py"
  #run_test "$CDIR/test_mp_all_to_all.py"
  run_test "$CDIR/test_mp_collective_permute.py"
  #run_test "$CDIR/test_mp_all_gather.py" # "wrong reductions"?
  run_test "$CDIR/test_mp_reduce_scatter.py"
  run_test "$CDIR/test_zero1.py"
  run_test "$CDIR/test_mp_distributed_mm.py"
  run_test "$CDIR/test_mp_save.py"
  run_test "$CDIR/test_mp_mesh_reduce.py"
  run_test "$CDIR/test_mp_sync_batch_norm.py"
  # NOTE(review): a "TODO(JackCaoG): enable this" used to sit above the next
  # line, but the test is already enabled here -- confirm the TODO is obsolete.
  run_test "$CDIR/dynamo/test_traceable_collectives.py"
  run_test "$CDIR/test_fsdp_auto_wrap.py"
  # run_torchrun "$CDIR/test_mp_early_exit.py"
  run_pt_xla_debug "$CDIR/debug_tool/test_mp_pt_xla_debug.py"
  run_test "$dist_dir/test_torch_distributed_all_gather_xla_backend.py"
  run_test "$dist_dir/test_torch_distributed_all_reduce_xla_backend.py"
  #run_test "$dist_dir/test_torch_distributed_bucketed_all_reduce_xla_backend.py" # crash without NEURONCORE_NUM_DEVICES=2
  run_test "$dist_dir/test_torch_distributed_multi_all_reduce_xla_backend.py"
  run_test "$dist_dir/test_torch_distributed_reduce_scatter_xla_backend.py"
  run_test "$dist_dir/test_ddp.py"
  #run_test "$dist_dir/test_torch_distributed_fsdp_meta.py" # crash without NEURONCORE_NUM_DEVICES=2

  # These bypass run_test/run_coverage and call python3 directly. The path is
  # quoted so that a checkout directory containing spaces still reaches
  # python3 as one argument.
  for two_core_test in \
    test_torch_distributed_all_gather_xla_backend.py \
    test_torch_distributed_all_reduce_xla_backend.py \
    test_torch_distributed_bucketed_all_reduce_xla_backend.py \
    test_torch_distributed_multi_all_reduce_xla_backend.py \
    test_torch_distributed_reduce_scatter_xla_backend.py \
    test_torch_distributed_fsdp_meta.py; do
    PJRT_DEVICE=NEURON NEURONCORE_NUM_DEVICES=2 python3 "$dist_dir/$two_core_test"
  done
}
| 282 | + |
# Select and run test shards based on environment variables.
# Globals (all read, all optional):
#   RUN_XLA_OP_TESTS1=xla_op1         run only shard 1
#   RUN_XLA_OP_TESTS2=xla_op2         run only shard 2
#   RUN_XLA_OP_TESTS3=xla_op3         run only shard 3
#   RUN_TORCH_MP_OP_TESTS=torch_mp_op run only the multi-process tests
#   XLA_SKIP_XLA_OP_TESTS=1           unsharded run: skip the three XLA shards
#   XLA_SKIP_MP_OP_TESTS=1            unsharded run: skip the multi-process tests
# The RUN_* variables are checked in the order above and the first match wins.
# Every variable is read as ${VAR:-} so the function also works when the
# caller has enabled `set -u`.
function run_tests {
  # RUN_ flags filter an explicit test type to run, XLA_SKIP_ flags exclude one.
  if [[ "${RUN_XLA_OP_TESTS1:-}" == "xla_op1" ]]; then
    echo "Running xla op tests..."
    run_xla_op_tests1
  elif [[ "${RUN_XLA_OP_TESTS2:-}" == "xla_op2" ]]; then
    echo "Running xla op tests..."
    run_xla_op_tests2
  elif [[ "${RUN_XLA_OP_TESTS3:-}" == "xla_op3" ]]; then
    echo "Running xla op tests..."
    run_xla_op_tests3
  elif [[ "${RUN_TORCH_MP_OP_TESTS:-}" == "torch_mp_op" ]]; then
    # Only the multi-process tests run here; the torch op tests are disabled.
    echo "Running mp op tests..."
    #run_torch_op_tests
    run_mp_op_tests
  else
    # Run full tests without sharding, respects XLA_SKIP_*
    if [[ "${XLA_SKIP_XLA_OP_TESTS:-}" != "1" ]]; then
      run_xla_op_tests1
      run_xla_op_tests2
      run_xla_op_tests3
    fi
    #if [[ "${XLA_SKIP_TORCH_OP_TESTS:-}" != "1" ]]; then
    #  run_torch_op_tests
    #fi
    if [[ "${XLA_SKIP_MP_OP_TESTS:-}" != "1" ]]; then
      run_mp_op_tests
    fi
  fi
}
| 313 | + |
# Entry point: run the selected tests. When LOGFILE is non-empty (the default;
# -L clears it), stdout and stderr are also copied into that file.
if [[ -n "$LOGFILE" ]]; then
  # LOGFILE is quoted so a path containing spaces is passed to tee as one
  # file name. With `set -o pipefail` the pipeline still reports a run_tests
  # failure even though tee itself succeeds.
  run_tests 2>&1 | tee "$LOGFILE"
else
  run_tests
fi
0 commit comments