1+ #! /bin/bash
2+ # Copyright 2024 The JAX Authors.
3+ #
4+ # Licensed under the Apache License, Version 2.0 (the "License");
5+ # you may not use this file except in compliance with the License.
6+ # You may obtain a copy of the License at
7+ #
8+ # http://www.apache.org/licenses/LICENSE-2.0
9+ #
10+ # Unless required by applicable law or agreed to in writing, software
11+ # distributed under the License is distributed on an "AS IS" BASIS,
12+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+ # See the License for the specific language governing permissions and
14+ # limitations under the License.
15+ # ==============================================================================
# Run Bazel GPU tests without RBE. This runs two commands: single accelerator
# tests with one GPU apiece, and multiaccelerator tests with all GPUs.
# Requires that the jaxlib, jax-cuda-plugin, and jax-cuda-pjrt wheels are
# stored inside the ../dist folder.
20+ #
# Shell options used for the whole script:
#   -e: abort the script as soon as one command fails
#   -u: treat the use of an undefined variable as an error
#   -x: log every command before it is executed
#   -o history: record shell history
#   -o allexport: export all functions and variables so that they are
#                 available to subscripts
set -exu -o history -o allexport

# Source default JAXCI environment variables.
# The path is relative, so it is resolved against the current working
# directory of the caller.
source ci/envs/default.env

# Set up the build environment.
# The quoted path must not contain any leading whitespace, otherwise `source`
# looks for a file whose name starts with a space and the script aborts.
source "ci/utilities/setup_build_environment.sh"
33+
# Run Bazel GPU tests (single accelerator and multiaccelerator tests) directly
# on the VM without RBE.
# Dump the GPU inventory into the log before any test is started.
nvidia-smi
echo " Running single accelerator tests (without RBE)..."

# Set up test environment variables.
# Each assignment is kept separate from its `export` so that a failing command
# substitution is not masked by the exit status of `export`.
# gpu_count: number of lines printed by the nvidia-smi query (presumably one
# line per GPU -- see the nvidia-smi documentation for --query-gpu).
gpu_count=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
num_test_jobs=$(( gpu_count * JAXCI_MAX_TESTS_PER_GPU ))
num_cpu_cores=$(nproc)

# num_test_jobs = min(gpu_count * max_tests_per_gpu, num_cpu_cores)
# i.e. never schedule more concurrent tests than there are CPU cores.
if (( num_test_jobs > num_cpu_cores )); then
  num_test_jobs=$num_cpu_cores
fi

export gpu_count num_test_jobs num_cpu_cores
# End of test environment variables setup.
49+
# Runs single accelerator tests with one GPU apiece.
# It appears --run_under needs an absolute path, hence the "$(pwd)/..." prefix.
# The product of `JAX_ACCELERATOR_COUNT` and `JAX_TESTS_PER_ACCELERATOR`
# should match the VM's CPU core count (set in `--local_test_jobs`).
# All expansions are quoted without surrounding whitespace so that Bazel
# receives the exact values (no padded version strings or broken paths).
bazel test --config=ci_linux_x86_64_cuda \
  --repo_env=HERMETIC_PYTHON_VERSION="$JAXCI_HERMETIC_PYTHON_VERSION" \
  --//jax:build_jaxlib=false \
  --test_env=XLA_PYTHON_CLIENT_ALLOCATOR=platform \
  --run_under "$(pwd)/build/parallel_accelerator_execute.sh" \
  --test_output=errors \
  --test_env=JAX_ACCELERATOR_COUNT="$gpu_count" \
  --test_env=JAX_TESTS_PER_ACCELERATOR="$JAXCI_MAX_TESTS_PER_GPU" \
  --local_test_jobs="$num_test_jobs" \
  --test_env=JAX_EXCLUDE_TEST_TARGETS=PmapTest.testSizeOverflow \
  --test_tag_filters=-multiaccelerator \
  --test_env=TF_CPP_MIN_LOG_LEVEL=0 \
  --test_env=JAX_SKIP_SLOW_TESTS=true \
  --action_env=JAX_ENABLE_X64="$JAXCI_ENABLE_X64" \
  --action_env=NCCL_DEBUG=WARN \
  --color=yes \
  //tests:gpu_tests //tests:backend_independent_tests \
  //tests/pallas:gpu_tests //tests/pallas:backend_independent_tests
72+
echo " Running multi-accelerator tests (without RBE)..."
# Runs multiaccelerator tests with all GPUs directly on the VM without RBE.
# Only targets tagged `multiaccelerator` are selected; no --run_under wrapper
# is passed here, unlike the single accelerator invocation.
# Expansions are quoted without surrounding whitespace so that Bazel receives
# the exact values.
bazel test --config=ci_linux_x86_64_cuda \
  --repo_env=HERMETIC_PYTHON_VERSION="$JAXCI_HERMETIC_PYTHON_VERSION" \
  --//jax:build_jaxlib=false \
  --test_env=XLA_PYTHON_CLIENT_ALLOCATOR=platform \
  --test_output=errors \
  --jobs=8 \
  --test_tag_filters=multiaccelerator \
  --test_env=TF_CPP_MIN_LOG_LEVEL=0 \
  --test_env=JAX_SKIP_SLOW_TESTS=true \
  --action_env=JAX_ENABLE_X64="$JAXCI_ENABLE_X64" \
  --action_env=NCCL_DEBUG=WARN \
  --color=yes \
  //tests:gpu_tests //tests/pallas:gpu_tests