
Commit e657c87

Update tests for Neuron (#8053)
1 parent ea8c47a commit e657c87

13 files changed: +341 -13 lines changed

examples/train_decoder_only_base.py

Lines changed: 4 additions & 1 deletion
@@ -18,7 +18,10 @@ class TrainDecoderOnlyBase():

   def __init__(self):
     self.config = DecoderOnlyConfig()
-    self.batch_size = 16
+    if xr.device_type() == 'NEURON':
+      self.batch_size = 4
+    else:
+      self.batch_size = 16
     self.seq_len = 512
     self.num_steps = 200
     self.num_epochs = 1
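
The change above keys the batch size off torch_xla's runtime device query instead of hard-coding 16. A minimal standalone sketch of that pattern, assuming torch_xla is installed and PJRT_DEVICE is configured (the DecoderOnlyConfig and training loop are not reproduced here, and the memory-pressure rationale is an assumption, not stated in the diff):

import torch_xla.runtime as xr

def pick_batch_size():
  # xr.device_type() reports the active PjRt backend, e.g. 'CPU', 'TPU',
  # 'CUDA' or 'NEURON'. The smaller batch on NEURON presumably keeps the
  # example within device memory (assumption; the commit does not say).
  if xr.device_type() == 'NEURON':
    return 4
  return 16

if __name__ == '__main__':
  print(pick_batch_size())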

test/dynamo/test_dynamo.py

Lines changed: 9 additions & 0 deletions
@@ -30,7 +30,12 @@ def _is_on_tpu():
   return xr.device_type() == 'TPU'


+def _is_on_neuron():
+  return xr.device_type() == 'NEURON'
+
+
 skipOnTpu = unittest.skipIf(_is_on_tpu(), 'Not supported on TPU')
+skipOnNeuron = unittest.skipIf(_is_on_neuron(), 'Not supported on NEURON')


 class DynamoInPlaceTest(unittest.TestCase):
@@ -152,6 +157,7 @@ def _choose_proper_device(self, initialize_on_cuda):
     })
     return "cuda:0"

+  @skipOnNeuron
   def test_simple_model(self):
     device = xm.xla_device()
     x = torch.tensor(100.0)
@@ -361,6 +367,7 @@ def get_loader(self, device, sample_count, batch_size=4):
     return loader

   @skipOnTpu
+  @skipOnNeuron
   @parameterized.parameters(
       True,
       False,
@@ -393,6 +400,7 @@ def test_resnet18(self, initialize_on_cuda):
     self.assertEqual(
         met.metric_data('RunCachedGraphOutputData')[0], sample_count)

+  @skipOnNeuron
   def test_resnet18_lazy_vs_dynamo(self):
     sample_count = xu.getenv_as('SAMPLE_COUNT', int, defval=10)
     device = torch_xla.device()
@@ -555,6 +563,7 @@ def test_simple_model(self):
         input.grad, xla_input.grad.cpu(), rtol=1e-05, atol=1e-04))

   @skipOnTpu
+  @skipOnNeuron
   def test_resnet18(self):
     torch._dynamo.reset()
     met.clear_counters()
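
The new skipOnNeuron marker mirrors the existing skipOnTpu one: the device check runs once at import time and the resulting decorator excludes individual tests on that backend. A minimal sketch of the pattern with an illustrative test class (ExampleTest is hypothetical, not part of the diff):

import unittest

import torch_xla.runtime as xr


def _is_on_neuron():
  return xr.device_type() == 'NEURON'


# Evaluated once at import time, exactly like the markers in test_dynamo.py.
skipOnNeuron = unittest.skipIf(_is_on_neuron(), 'Not supported on NEURON')


class ExampleTest(unittest.TestCase):

  @skipOnNeuron
  def test_runs_everywhere_but_neuron(self):
    self.assertTrue(True)


if __name__ == '__main__':
  unittest.main()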

test/neuron/run_tests.sh

Lines changed: 316 additions & 2 deletions
@@ -1,4 +1,318 @@
 #!/bin/bash
-set -xue
+set -exo pipefail
+CDIR="$(cd "$(dirname "$0")"/../ ; pwd -P)"
+LOGFILE=/tmp/pytorch_py_test.log
+MAX_GRAPH_SIZE=500
+GRAPH_CHECK_FREQUENCY=100
+VERBOSITY=2

-python3 test/neuron/test_neuron_utils.py
+# Note [Keep Going]
+#
+# Set the `CONTINUE_ON_ERROR` flag to `true` to make the CI tests continue on error.
+# This will allow you to see all the failures on your PR, not stopping with the first
+# test failure like the default behavior.
+CONTINUE_ON_ERROR="${CONTINUE_ON_ERROR:-0}"
+if [[ "$CONTINUE_ON_ERROR" == "1" ]]; then
+  set +e
+fi
+
+while getopts 'LM:C:V:' OPTION
+do
+  case $OPTION in
+    L)
+      LOGFILE=
+      ;;
+    M)
+      MAX_GRAPH_SIZE=$OPTARG
+      ;;
+    C)
+      GRAPH_CHECK_FREQUENCY=$OPTARG
+      ;;
+    V)
+      VERBOSITY=$OPTARG
+      ;;
+  esac
+done
+shift $(($OPTIND - 1))
+
+export TRIM_GRAPH_SIZE=$MAX_GRAPH_SIZE
+export TRIM_GRAPH_CHECK_FREQUENCY=$GRAPH_CHECK_FREQUENCY
+export TORCH_TEST_DEVICES="$CDIR/pytorch_test_base.py"
+export PYTORCH_TEST_WITH_SLOW=1
+export XLA_DUMP_FATAL_STACK=1
+export CPU_NUM_DEVICES=4
+
+TORCH_XLA_DIR=$(cd ~; dirname "$(python -c 'import torch_xla; print(torch_xla.__file__)')")
+COVERAGE_FILE="$CDIR/../.coverage"
+
+function run_coverage {
+  if [ "${USE_COVERAGE:-0}" != "0" ]; then
+    coverage run --source="$TORCH_XLA_DIR" -p "$@"
+  else
+    python3 "$@"
+  fi
+}
+
+function run_test {
+  echo "Running in PjRt runtime: $@"
+  PJRT_DEVICE=NEURON NEURON_NUM_DEVICES=1 run_coverage "$@"
+}
+
+function run_test_without_functionalization {
+  echo "Running with XLA_DISABLE_FUNCTIONALIZATION: $@"
+  XLA_DISABLE_FUNCTIONALIZATION=1 run_test "$@"
+}
+
+function run_xla_ir_debug {
+  echo "Running with XLA_IR_DEBUG: $@"
+  XLA_IR_DEBUG=1 run_test "$@"
+}
+
+function run_use_bf16 {
+  echo "Running with XLA_USE_BF16: $@"
+  XLA_USE_BF16=1 run_test "$@"
+}
+
+function run_downcast_bf16 {
+  echo "Running with XLA_DOWNCAST_BF16: $@"
+  XLA_DOWNCAST_BF16=1 run_test "$@"
+}
+
+function run_xla_hlo_debug {
+  echo "Running with XLA_IR_DEBUG: $@"
+  XLA_HLO_DEBUG=1 run_test "$@"
+}
+
+function run_dynamic {
+  echo "Running in DynamicShape mode: $@"
+  XLA_EXPERIMENTAL="nonzero:masked_select:masked_scatter:nms" run_test "$@"
+}
+
+function run_eager_debug {
+  echo "Running in Eager Debug mode: $@"
+  XLA_USE_EAGER_DEBUG_MODE=1 run_test "$@"
+}
+
+function run_save_tensor_ir {
+  echo "Running in save tensor file mode: $@"
+  XLA_SAVE_TENSORS_FILE="/tmp/xla_test_save_ir.txt" XLA_SAVE_TENSORS_FMT="text" run_test "$@"
+}
+
+function run_save_tensor_hlo {
+  echo "Running in save tensor file mode: $@"
+  XLA_SAVE_TENSORS_FILE="/tmp/xla_test_save_ir.txt" XLA_SAVE_TENSORS_FMT="hlo" run_test "$@"
+}
+
+function run_pt_xla_debug {
+  echo "Running in save tensor file mode: $@"
+  PT_XLA_DEBUG=1 PT_XLA_DEBUG_FILE="/tmp/pt_xla_debug.txt" run_test "$@"
+}
+
+function run_pt_xla_debug_level1 {
+  echo "Running in save tensor file mode: $@"
+  PT_XLA_DEBUG_LEVEL=1 PT_XLA_DEBUG_FILE="/tmp/pt_xla_debug.txt" run_test "$@"
+}
+
+function run_torchrun {
+  PJRT_DEVICE=NEURON torchrun --nnodes 1 --nproc-per-node 2 $@
+}
+
+function run_torch_op_tests {
+  run_dynamic "$CDIR/../../test/test_view_ops.py" "$@" -v TestViewOpsXLA
+  run_test_without_functionalization "$CDIR/../../test/test_view_ops.py" "$@" -v TestViewOpsXLA
+  run_test "$CDIR/../../test/test_torch.py" "$@" -v TestTorchDeviceTypeXLA
+  run_dynamic "$CDIR/../../test/test_torch.py" "$@" -v TestDevicePrecisionXLA
+  run_test "$CDIR/../../test/test_torch.py" "$@" -v TestTensorDeviceOpsXLA
+  run_test "$CDIR/../../test/test_indexing.py" "$@" -v TestIndexingXLA
+  run_test "$CDIR/../../test/test_indexing.py" "$@" -v NumpyTestsXLA
+  # run_dynamic "$CDIR/../../test/test_nn.py" "$@" -v TestNNDeviceTypeXLA
+  run_dynamic "$CDIR/../../test/nn/test_dropout.py" "$@" -v TestDropoutNNDeviceTypeXLA
+  run_dynamic "$CDIR/../../test/nn/test_pooling.py" "$@" -v TestPoolingNNDeviceTypeXLA
+  run_dynamic "$CDIR/../../test/nn/test_embedding.py" "$@" -v TestEmbeddingNNDeviceTypeXLA
+  run_dynamic "$CDIR/../../test/nn/test_convolution.py" "$@" -v TestConvolutionNNDeviceTypeXLA
+  run_dynamic "$CDIR/../../test/nn/test_multihead_attention.py" "$@" -v TestMultiheadAttentionNNDeviceTypeXLA
+  run_dynamic "$CDIR/../../test/test_type_promotion.py" "$@" -v TestTypePromotionXLA
+}
+
+#######################################################################################
+################################# XLA OP TESTS SHARDS #################################
+#######################################################################################
+
+# DO NOT MODIFY
+function run_xla_op_tests1 {
+  #run_dynamic "$CDIR/test_operations.py" "$@" --verbosity=$VERBOSITY
+  #run_dynamic "$CDIR/ds/test_dynamic_shapes.py"
+  #run_dynamic "$CDIR/ds/test_dynamic_shape_models.py" "$@" --verbosity=$VERBOSITY
+  #run_eager_debug "$CDIR/test_operations.py" "$@" --verbosity=$VERBOSITY
+  #run_test "$CDIR/test_operations.py" "$@" --verbosity=$VERBOSITY
+  #run_test_without_functionalization "$CDIR/test_operations.py" "$@" --verbosity=$VERBOSITY
+  run_pt_xla_debug "$CDIR/debug_tool/test_pt_xla_debug.py"
+  run_pt_xla_debug_level1 "$CDIR/debug_tool/test_pt_xla_debug.py"
+  run_test "$CDIR/test_async_closures.py"
+  run_test "$CDIR/test_hlo_metadata.py"
+  #run_test "$CDIR/test_profiler.py"
+  run_test "$CDIR/pjrt/test_runtime.py"
+  #NEURONCORE_NUM_DEVICES=2 python "$CDIR/pjrt/test_ddp.py"
+  run_test "$CDIR/pjrt/test_mesh_service.py"
+  #run_test "$CDIR/test_python_ops.py"
+  #run_test "$CDIR/test_ops.py"
+  run_test "$CDIR/test_metrics.py"
+  run_test "$CDIR/test_deprecation.py"
+  run_test "$CDIR/dynamo/test_dynamo_integrations_util.py"
+  #run_test "$CDIR/dynamo/test_dynamo_aliasing.py"
+  run_test "$CDIR/dynamo/test_dynamo.py"
+  run_test "$CDIR/dynamo/test_dynamo_dynamic_shape.py"
+  run_test "$CDIR/dynamo/test_bridge.py"
+  run_test "$CDIR/dynamo/test_num_output.py"
+  run_test "$CDIR/dynamo/test_graph_input_matcher.py"
+  run_test "$CDIR/dynamo/test_dynamo_config.py"
+  run_save_tensor_ir "$CDIR/dynamo/test_dynamo_graph_dump.py"
+  #run_test "$CDIR/test_data_type.py"
+  run_use_bf16 "$CDIR/test_data_type.py"
+  run_downcast_bf16 "$CDIR/test_data_type.py"
+  #run_test "$CDIR/test_fp8.py"
+  run_xla_ir_debug "$CDIR/test_env_var_mapper.py"
+  run_xla_hlo_debug "$CDIR/test_env_var_mapper.py"
+  run_xla_hlo_debug "$CDIR/stablehlo/test_stablehlo_save_load.py"
+  run_save_tensor_ir "$CDIR/spmd/test_spmd_graph_dump.py"
+  run_save_tensor_hlo "$CDIR/spmd/test_spmd_graph_dump.py"
+}
+
+function run_xla_op_tests2 {
+  run_test "$CDIR/pjrt/test_dtypes.py"
+  #run_test "$CDIR/test_while_loop.py"
+  run_test "$CDIR/test_scan.py"
+  run_test "$CDIR/test_autocast.py"
+  run_test "$CDIR/test_grad_checkpoint.py"
+  #run_test "$CDIR/eager/test_eager.py"
+  run_test "$CDIR/eager/test_eager_with_xla_compile.py"
+  run_test "$CDIR/eager/test_eager_with_torch_compile.py"
+  #run_test "$CDIR/eager/test_eager_all_reduce_in_place.py"
+  run_test "$CDIR/eager/test_eager_spmd.py"
+  run_test "$CDIR/test_callback.py"
+  XLA_USE_SPMD=1 run_test "$CDIR/test_callback.py"
+}
+
+# All the new xla op tests should go to run_xla_op_tests3
+function run_xla_op_tests3 {
+  # TODO(qihqi): this test require tensorflow to run. need to setup separate
+  # CI with tf.
+  run_test "$CDIR/stablehlo/test_exports.py"
+  run_test "$CDIR/stablehlo/test_export_fx_passes.py"
+  run_test "$CDIR/stablehlo/test_implicit_broadcasting.py"
+  run_test "$CDIR/stablehlo/test_composite.py"
+  run_test "$CDIR/stablehlo/test_pt2e_qdq.py"
+  run_test "$CDIR/stablehlo/test_stablehlo_custom_call.py"
+  #run_xla_hlo_debug "$CDIR/stablehlo/test_stablehlo_inference.py"
+  #=run_test "$CDIR/stablehlo/test_stablehlo_compile.py"
+  run_test "$CDIR/stablehlo/test_unbounded_dynamism.py"
+  #run_test "$CDIR/quantized_ops/test_quantized_matmul.py"
+  #run_test "$CDIR/quantized_ops/test_dot_general.py"
+  #run_test "$CDIR/spmd/test_xla_sharding.py"
+  run_test "$CDIR/spmd/test_xla_sharding_hlo.py"
+  #run_test "$CDIR/spmd/test_xla_virtual_device.py"
+  #run_test "$CDIR/spmd/test_dynamo_spmd.py"
+  run_test "$CDIR/spmd/test_spmd_debugging.py"
+  #=run_test "$CDIR/spmd/test_xla_distributed_checkpoint.py"
+  run_test "$CDIR/spmd/test_xla_spmd_python_api_interaction.py"
+  #run_test "$CDIR/spmd/test_dtensor_integration.py"
+  #run_test "$CDIR/spmd/test_dtensor_integration2.py"
+  run_test "$CDIR/spmd/test_xla_auto_sharding.py"
+  #run_test "$CDIR/spmd/test_spmd_parameter_wrapping.py"
+  run_test "$CDIR/spmd/test_train_spmd_linear_model.py"
+  run_test "$CDIR/spmd/test_xla_spmd_python_api_interaction.py"
+  run_test "$CDIR/spmd/test_xla_auto_sharding.py"
+  run_test "$CDIR/spmd/test_fsdp_v2.py"
+  run_test "$CDIR/test_operations_hlo.py" "$@" --verbosity=$VERBOSITY
+  run_test "$CDIR/test_input_output_aliases.py"
+  run_test "$CDIR/test_torch_distributed_xla_backend.py"
+  run_torchrun "$CDIR/pjrt/test_torchrun.py"
+  run_test "$CDIR/test_persistent_cache.py"
+  run_test "$CDIR/test_devices.py"
+
+  run_test "$CDIR/neuron/test_neuron_utils.py"
+
+  #python3 examples/data_parallel/train_resnet_xla_ddp.py # compiler error
+  #python3 examples/fsdp/train_resnet_fsdp_auto_wrap.py
+  #python3 examples/eager/train_decoder_only_eager.py # OOM
+  #python3 examples/eager/train_decoder_only_eager_spmd_data_parallel.py # compiler err due to f64
+  PJRT_DEVICE=NEURON NEURONCORE_NUM_DEVICES=1 python3 examples/eager/train_decoder_only_eager_with_compile.py # nan loss expected?
+  PJRT_DEVICE=NEURON NEURONCORE_NUM_DEVICES=1 python3 examples/eager/train_decoder_only_eager_multi_process.py
+}
+
+#######################################################################################
+
+function run_op_tests {
+  run_torch_op_tests
+  run_xla_op_tests1
+  run_xla_op_tests2
+  run_xla_op_tests3
+}
+
+function run_mp_op_tests {
+  run_test "$CDIR/test_mp_replication.py"
+  #run_test "$CDIR/test_mp_all_to_all.py"
+  run_test "$CDIR/test_mp_collective_permute.py"
+  #run_test "$CDIR/test_mp_all_gather.py" # "wrong reductions"?
+  run_test "$CDIR/test_mp_reduce_scatter.py"
+  run_test "$CDIR/test_zero1.py"
+  run_test "$CDIR/test_mp_distributed_mm.py"
+  run_test "$CDIR/test_mp_save.py"
+  run_test "$CDIR/test_mp_mesh_reduce.py"
+  run_test "$CDIR/test_mp_sync_batch_norm.py"
+  # TODO(JackCaoG): enable this
+  run_test "$CDIR/dynamo/test_traceable_collectives.py"
+  run_test "$CDIR/test_fsdp_auto_wrap.py"
+  # run_torchrun "$CDIR/test_mp_early_exit.py"
+  run_pt_xla_debug "$CDIR/debug_tool/test_mp_pt_xla_debug.py"
+  run_test "$CDIR/torch_distributed/test_torch_distributed_all_gather_xla_backend.py"
+  run_test "$CDIR/torch_distributed/test_torch_distributed_all_reduce_xla_backend.py"
+  #run_test "$CDIR/torch_distributed/test_torch_distributed_bucketed_all_reduce_xla_backend.py" # crash without NEURONCORE_NUM_DEVICES=2
+  run_test "$CDIR/torch_distributed/test_torch_distributed_multi_all_reduce_xla_backend.py"
+  run_test "$CDIR/torch_distributed/test_torch_distributed_reduce_scatter_xla_backend.py"
+  run_test "$CDIR/torch_distributed/test_ddp.py"
+  #run_test "$CDIR/torch_distributed/test_torch_distributed_fsdp_meta.py" # crash without NEURONCORE_NUM_DEVICES=2
+  PJRT_DEVICE=NEURON NEURONCORE_NUM_DEVICES=2 python3 $CDIR/torch_distributed/test_torch_distributed_all_gather_xla_backend.py
+  PJRT_DEVICE=NEURON NEURONCORE_NUM_DEVICES=2 python3 $CDIR/torch_distributed/test_torch_distributed_all_reduce_xla_backend.py
+  PJRT_DEVICE=NEURON NEURONCORE_NUM_DEVICES=2 python3 $CDIR/torch_distributed/test_torch_distributed_bucketed_all_reduce_xla_backend.py
+  PJRT_DEVICE=NEURON NEURONCORE_NUM_DEVICES=2 python3 $CDIR/torch_distributed/test_torch_distributed_multi_all_reduce_xla_backend.py
+  PJRT_DEVICE=NEURON NEURONCORE_NUM_DEVICES=2 python3 $CDIR/torch_distributed/test_torch_distributed_reduce_scatter_xla_backend.py
+  PJRT_DEVICE=NEURON NEURONCORE_NUM_DEVICES=2 python3 $CDIR/torch_distributed/test_torch_distributed_fsdp_meta.py
+}
+
+function run_tests {
+  # RUN_ flags filter an explicit test type to run, XLA_SKIP_ flags exclude one.
+  if [[ "$RUN_XLA_OP_TESTS1" == "xla_op1" ]]; then
+    echo "Running xla op tests..."
+    run_xla_op_tests1
+  elif [[ "$RUN_XLA_OP_TESTS2" == "xla_op2" ]]; then
+    echo "Running xla op tests..."
+    run_xla_op_tests2
+  elif [[ "$RUN_XLA_OP_TESTS3" == "xla_op3" ]]; then
+    echo "Running xla op tests..."
+    run_xla_op_tests3
+  elif [[ "$RUN_TORCH_MP_OP_TESTS" == "torch_mp_op" ]]; then
+    echo "Running torch op tests..."
+    #run_torch_op_tests
+    run_mp_op_tests
+  else
+    # Run full tests without sharding, respects XLA_SKIP_*
+    if [[ "$XLA_SKIP_XLA_OP_TESTS" != "1" ]]; then
+      run_xla_op_tests1
+      run_xla_op_tests2
+      run_xla_op_tests3
+    fi
+    #if [[ "$XLA_SKIP_TORCH_OP_TESTS" != "1" ]]; then
+    #  run_torch_op_tests
+    #fi
+    if [[ "$XLA_SKIP_MP_OP_TESTS" != "1" ]]; then
+      run_mp_op_tests
+    fi
+  fi
+}
+
+if [ "$LOGFILE" != "" ]; then
+  run_tests 2>&1 | tee $LOGFILE
+else
+  run_tests
+fi
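
Every run_test invocation above prefixes the command with PJRT_DEVICE=NEURON and NEURON_NUM_DEVICES=1, so each test process initializes the Neuron PjRt backend. A small sketch of what that amounts to on the Python side, assuming the Neuron plugin for torch_xla is installed (whether the plugin reads NEURON_NUM_DEVICES at initialization time is an assumption):

import os

# The shell wrapper exports these per test; setting them in-process before
# importing torch_xla is assumed to have the same effect.
os.environ.setdefault('PJRT_DEVICE', 'NEURON')
os.environ.setdefault('NEURON_NUM_DEVICES', '1')

import torch_xla.runtime as xr

# Expected to report 'NEURON' under this configuration.
print(xr.device_type())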

test/test_mp_all_gather.py

Lines changed: 3 additions & 1 deletion
@@ -34,7 +34,9 @@ def _mp_fn(index):
     cpu_result = result.cpu()
     expected = torch.arange(0, world_size, dtype=torch.float)
     if not cpu_result.allclose(expected):
-      print('xm.all_gather() produced wrong reductions', file=sys.stderr)
+      print(
+          'xm.all_gather() produced wrong reductions (torch.compile)',
+          file=sys.stderr)
       print(f'[{index}] {cpu_result}', file=sys.stderr)
       sys.exit(1)

test/test_mp_all_to_all.py

Lines changed: 1 addition & 1 deletion
@@ -7,7 +7,7 @@


 def _mp_fn(index):
   device = xm.xla_device()
-  if xm.xla_device_hw(device) == 'TPU':
+  if xm.xla_device_hw(device) in ('TPU', 'NEURON'):
     slots_per_device = 4
     size = slots_per_device * xr.world_size()
     ordinal = xr.global_ordinal()
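
With the guard extended to NEURON, the test sizes its all-to-all input from the process group the same way on both backends. A short sketch of that sizing logic using torch_xla's runtime helpers, assuming a multi-core launch such as torchrun or xmp.spawn (the ordinal-valued fill is illustrative, not copied from the test):

import torch
import torch_xla.core.xla_model as xm
import torch_xla.runtime as xr


def build_all_to_all_input():
  device = xm.xla_device()
  # Mirrors the sizing in test_mp_all_to_all.py: four slots per participant,
  # so each device holds slots_per_device * world_size elements.
  slots_per_device = 4
  size = slots_per_device * xr.world_size()
  ordinal = xr.global_ordinal()
  # Fill with this device's ordinal so mis-routed slots are easy to spot.
  return torch.full((size,), float(ordinal), device=device)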
