# release_data_tests.yaml
- name: DEFAULTS
group: data-tests
working_dir: nightly_tests/dataset
frequency: nightly
team: data
cluster:
byod:
runtime_env:
# Enable verbose stats for resource manager (to troubleshoot autoscaling)
- RAY_DATA_DEBUG_RESOURCE_MANAGER=1
# 'type: gpu' means: use the 'ray-ml' image.
type: gpu
cluster_compute: fixed_size_cpu_compute.yaml
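# For illustration only (not an active entry): a sketch of how a test below is expected
# to resolve, assuming the DEFAULTS entry above is merged into each test unless the test
# overrides a field. Using the read_tfrecords test defined further down:
#
# - name: read_tfrecords
#   group: data-tests                      # inherited from DEFAULTS
#   working_dir: nightly_tests/dataset     # inherited from DEFAULTS
#   frequency: nightly
#   team: data
#   cluster:
#     byod:
#       runtime_env:
#         - RAY_DATA_DEBUG_RESOURCE_MANAGER=1
#       type: gpu
#     cluster_compute: fixed_size_cpu_compute.yaml
#   run:
#     timeout: 3600
#     script: >
#       python read_and_consume_benchmark.py
#       s3://ray-benchmark-data-internal/imagenet/tfrecords --format tfrecords
#       --iter-bundles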
###############
# Reading tests
###############
- name: "read_parquet_{{scaling}}"
cluster:
cluster_compute: "{{scaling}}_cpu_compute.yaml"
matrix:
setup:
scaling: [fixed_size, autoscaling]
run:
timeout: 3600
script: >
python read_and_consume_benchmark.py
s3://ray-benchmark-data-internal/imagenet/parquet --format parquet
--iter-bundles
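# For illustration only: the matrix above substitutes each value of 'scaling' into the
# "{{scaling}}" placeholders, so this single entry is expected to expand into two
# concrete tests (exact name rendering may differ):
#
# - name: read_parquet_fixed_size
#   cluster:
#     cluster_compute: fixed_size_cpu_compute.yaml
# - name: read_parquet_autoscaling
#   cluster:
#     cluster_compute: autoscaling_cpu_compute.yaml
#
# (The run section is identical in both and is omitted here.)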
- name: "read_images_{{scaling}}"
cluster:
cluster_compute: "{{scaling}}_cpu_compute.yaml"
matrix:
setup:
scaling: [fixed_size, autoscaling]
run:
timeout: 3600
script: >
python read_and_consume_benchmark.py
s3://anyscale-imagenet/ILSVRC/Data/CLS-LOC/ --format image --iter-bundles
- name: read_tfrecords
run:
timeout: 3600
script: >
python read_and_consume_benchmark.py
s3://ray-benchmark-data-internal/imagenet/tfrecords --format tfrecords
--iter-bundles
- name: "read_from_uris_{{scaling}}"
cluster:
cluster_compute: "{{scaling}}_cpu_compute.yaml"
matrix:
setup:
scaling: [fixed_size, autoscaling]
run:
timeout: 5400
script: python read_from_uris_benchmark.py
- name: read_images_comparison_microbenchmark_single_node
frequency: manual
cluster:
byod:
post_build_script: byod_install_mosaicml.sh
cluster_compute: single_worker_node_0_head_node_benchmark_compute.yaml
run:
timeout: 1800
script: bash run_image_loader_microbenchmark.sh
###############
# Writing tests
###############
- name: write_parquet
run:
timeout: 3600
script: >
python read_and_consume_benchmark.py
s3://ray-benchmark-data/tpch/parquet/sf1000/lineitem --format parquet --write
###################
# Aggregation tests
###################
- name: "count_parquet_{{scaling}}"
cluster:
cluster_compute: "{{scaling}}_cpu_compute.yaml"
matrix:
setup:
scaling: [fixed_size, autoscaling]
run:
timeout: 600
script: >
python read_and_consume_benchmark.py
s3://ray-benchmark-data/tpch/parquet/sf10000/lineitem --format parquet --count
###############
# Groupby tests
###############
# The groupby tests use the TPC-H lineitem table. Here are the columns used for the
# groupbys and their corresponding TPC-H column names:
#
# | Our dataset | TPC-H column name |
# |-----------------|-------------------|
# | column02 | l_suppkey |
# | column08 | l_returnflag |
# | column13 | l_shipinstruct |
# | column14 | l_shipmode |
#
# Here is the number of groups for each groupby column combination at SF 1000:
#
# | Groupby columns | Number of groups |
# |----------------------------------|------------------|
# | column08, column13, column14 | 84 |
# | column02, column14 | 7,000,000 |
#
# The SF (scale factor) 1000 lineitem table contains ~6B rows.
# TODO: Bump the scale factor from SF10 to SF1000 once we can handle that scale.
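#
# As a sanity check on the 84-group case: in TPC-H, l_returnflag, l_shipinstruct, and
# l_shipmode have 3, 4, and 7 distinct values respectively, and 3 * 4 * 7 = 84.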
- name: "aggregate_groups_{{scaling}}_{{shuffle_strategy}}_{{columns}}"
matrix:
setup:
scaling: [fixed_size, autoscaling]
shuffle_strategy: [sort_shuffle_pull_based]
columns:
- "column08 column13 column14" # 84 groups
- "column02 column14" # 7M groups
cluster:
cluster_compute: "{{scaling}}_all_to_all_compute.yaml"
run:
timeout: 3600
script: >
python groupby_benchmark.py --sf 10 --aggregate --group-by {{columns}}
--shuffle-strategy {{shuffle_strategy}}
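# For illustration only: one fully substituted invocation from the matrix above
# (scaling=fixed_size, shuffle_strategy=sort_shuffle_pull_based, and the 84-group
# column set):
#
#   python groupby_benchmark.py --sf 10 --aggregate --group-by column08 column13 column14 \
#     --shuffle-strategy sort_shuffle_pull_based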
- name: "map_groups_{{scaling}}_{{shuffle_strategy}}_{{columns}}"
matrix:
setup:
      # This test consistently fails on fixed-size clusters because the head node runs
      # out of memory from holding too many object references, so we only run it on
      # autoscaling clusters.
scaling: [autoscaling]
shuffle_strategy: [sort_shuffle_pull_based]
columns:
- "column08 column13 column14" # 84 groups
- "column02 column14" # 7M groups
cluster:
cluster_compute: "{{scaling}}_all_to_all_compute.yaml"
run:
timeout: 3600
script: >
python groupby_benchmark.py --sf 10 --map-groups --group-by {{columns}}
--shuffle-strategy {{shuffle_strategy}}
#######################
# Streaming split tests
#######################
- name: streaming_split
run:
timeout: 300
script: python streaming_split_benchmark.py --num-workers 10
wait_for_nodes:
num_nodes: 10
variations:
- __suffix__: regular
- __suffix__: early_stop
      # This test case stops the data ingestion iteration early on the GPU actors.
      # This is a common usage pattern in PyTorch Lightning
      # (https://lightning.ai/docs/pytorch/stable/common/trainer.html#limit-train-batches).
      # There was a bug in Ray Data that caused a GPU memory leak in this scenario
      # (see #34819), so we add this test case to cover it.
run:
script: python streaming_split_benchmark.py --num-workers 10 --early-stop
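# For illustration only, assuming each variation is merged over the base test definition,
# the entry above is expected to yield two tests whose run sections look roughly like:
#
# # streaming_split (regular)
# run:
#   timeout: 300
#   script: python streaming_split_benchmark.py --num-workers 10
#   wait_for_nodes:
#     num_nodes: 10
#
# # streaming_split (early_stop)
# run:
#   timeout: 300
#   script: python streaming_split_benchmark.py --num-workers 10 --early-stop
#   wait_for_nodes:
#     num_nodes: 10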
################
# Training tests
################
- name: distributed_training
working_dir: nightly_tests
cluster:
byod:
post_build_script: byod_install_mosaicml.sh
cluster_compute: dataset/multi_node_train_16_workers.yaml
run:
timeout: 3600
script: >
python dataset/multi_node_train_benchmark.py --num-workers 16 --file-type parquet
--target-worker-gb 50 --use-gpu
variations:
- __suffix__: regular
- __suffix__: chaos
run:
prepare: >
python setup_chaos.py --kill-interval 200 --max-to-kill 1 --task-names
"_RayTrainWorker__execute.get_next"
#################
# Iteration tests
#################
- name: "iter_batches_{{format}}"
matrix:
setup:
format: [numpy, pandas, pyarrow]
run:
timeout: 2400
script: >
python read_and_consume_benchmark.py
s3://ray-benchmark-data/tpch/parquet/sf10/lineitem --format parquet
--iter-batches {{format}}
- name: to_tf
run:
timeout: 2400
script: >
python read_and_consume_benchmark.py
s3://air-example-data-2/100G-image-data-synthetic-raw/ --format image
--to-tf image image
- name: iter_torch_batches
cluster:
cluster_compute: fixed_size_gpu_head_compute.yaml
run:
timeout: 2400
script: >
python read_and_consume_benchmark.py
s3://air-example-data-2/100G-image-data-synthetic-raw/ --format image
--iter-torch-batches
###########
# Map tests
###########
- name: map
run:
timeout: 1800
script: python map_benchmark.py --api map --sf 10
- name: flat_map
run:
timeout: 1800
script: python map_benchmark.py --api flat_map --sf 10
- name: "map_batches_{{scaling}}_{{compute}}_{{format}}"
matrix:
setup:
format: [numpy, pandas, pyarrow]
compute: [tasks]
scaling: [fixed_size]
adjustments:
# Include at least one test with actors.
- with:
format: numpy
compute: actors
scaling: fixed_size
# Include at least one autoscaling test.
- with:
format: numpy
compute: tasks
scaling: autoscaling
cluster:
cluster_compute: "{{scaling}}_cpu_compute.yaml"
run:
timeout: 5400
script: >
python map_benchmark.py --api map_batches --batch-format {{format}}
--compute {{compute}} --sf 1000
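# For illustration only: assuming the 'adjustments' entries add combinations on top of
# the base cross product, the matrix above is expected to expand into roughly these five
# tests:
#
#   map_batches_fixed_size_tasks_numpy
#   map_batches_fixed_size_tasks_pandas
#   map_batches_fixed_size_tasks_pyarrow
#   map_batches_fixed_size_actors_numpy     # from the first adjustment
#   map_batches_autoscaling_tasks_numpy     # from the second adjustment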
########################
# Sort and shuffle tests
########################
- name: "random_shuffle_{{scaling}}"
matrix:
setup:
      # This release test consistently fails on autoscaling clusters, though the reason
      # for the failure is unclear, so we only run it on fixed-size clusters.
scaling: [fixed_size]
cluster:
byod:
runtime_env:
- RAY_worker_killing_policy=retriable_lifo
pip:
- ray[default]
cluster_compute: "{{scaling}}_all_to_all_compute.yaml"
run:
timeout: 10800
script: >
python sort_benchmark.py --num-partitions=1000 --partition-size=1e9 --shuffle
- name: random_shuffle_chaos
working_dir: nightly_tests
stable: False
cluster:
byod:
runtime_env:
- RAY_worker_killing_policy=retriable_lifo
pip:
- ray[default]
cluster_compute: dataset/autoscaling_all_to_all_compute.yaml
run:
timeout: 10800
prepare: >
python setup_chaos.py --chaos TerminateEC2Instance --kill-interval 600
--max-to-kill 2
script: >
python dataset/sort_benchmark.py --num-partitions=1000 --partition-size=1e9
--shuffle
- name: "sort_{{scaling}}"
  # This test intermittently fails due to Arrow offset overflow errors, or due to running
  # out of disk (OOD) when autoscaling is overly conservative.
stable: False
matrix:
setup:
scaling: [fixed_size, autoscaling]
cluster:
byod:
runtime_env:
- RAY_worker_killing_policy=retriable_lifo
pip:
- ray[default]
cluster_compute: "{{scaling}}_all_to_all_compute.yaml"
run:
timeout: 10800
script: python sort_benchmark.py --num-partitions=1000 --partition-size=1e9
- name: sort_chaos
working_dir: nightly_tests
stable: False
cluster:
byod:
runtime_env:
- RAY_worker_killing_policy=retriable_lifo
pip:
- ray[default]
cluster_compute: dataset/autoscaling_all_to_all_compute.yaml
run:
timeout: 10800
prepare: >
python setup_chaos.py --chaos TerminateEC2Instance --kill-interval 900
--max-to-kill 3
script: python dataset/sort_benchmark.py --num-partitions=1000 --partition-size=1e9
#######################
# Batch inference tests
#######################
# 300 GB of image classification parquet data on up to 10 GPUs
# (10 g4dn.12xlarge instances).
- name: "batch_inference_{{scaling}}"
cluster:
cluster_compute: "{{scaling}}_gpu_compute.yaml"
matrix:
setup:
scaling: [fixed_size, autoscaling]
run:
timeout: 1800
script: >
python gpu_batch_inference.py
--data-directory 300G-image-data-synthetic-raw-parquet --data-format parquet
- name: batch_inference_from_metadata
# This benchmark errors because of the issues described in PLAN-383.
frequency: manual
cluster:
cluster_compute: autoscaling_hetero_compute.yaml
run:
timeout: 1800
script: python batch_inference_benchmark.py
- name: batch_inference_chaos
stable: False
# Don't use 'nightly_tests/dataset' as the working directory because we need to run
# the 'setup_chaos.py' script.
working_dir: nightly_tests
cluster:
cluster_compute: dataset/autoscaling_gpu_compute.yaml
run:
timeout: 1800
    prepare: >
      python setup_chaos.py --chaos TerminateEC2Instance --batch-size-to-kill 2
      --max-to-kill 6 --kill-delay 30
script: >
python dataset/gpu_batch_inference.py
--data-directory 300G-image-data-synthetic-raw-parquet --data-format parquet --chaos-test
- name: batch_inference_chaos_no_scale_back
stable: False
working_dir: nightly_tests
cluster:
cluster_compute: dataset/autoscaling_gpu_compute.yaml
run:
timeout: 1800
prepare: python setup_cluster_compute_config_updater.py --updates worker_nodes.0.max_nodes:5:240
script: >
python dataset/gpu_batch_inference.py
--data-directory 300G-image-data-synthetic-raw-parquet --data-format parquet --chaos-test
# 10 TB of image classification parquet data on an autoscaling heterogeneous cluster
# (10 g4dn.12xlarge, 10 m5.16xlarge).
- name: batch_inference_hetero
frequency: weekly
cluster:
cluster_compute: autoscaling_hetero_compute.yaml
run:
timeout: 7200
script: >
python gpu_batch_inference.py
--data-directory 10T-image-data-synthetic-raw-parquet --data-format parquet
##############
# TPCH Queries
##############
- name: "tpch_q1_{{scaling}}"
matrix:
setup:
scaling: [fixed_size, autoscaling]
cluster:
cluster_compute: "{{scaling}}_all_to_all_compute.yaml"
run:
timeout: 5400
script: python tpch_q1.py --sf 100