diff --git a/.github/workflows/build_docs.yml b/.github/workflows/build_docs.yml
index 9d8153bb..d1321605 100644
--- a/.github/workflows/build_docs.yml
+++ b/.github/workflows/build_docs.yml
@@ -15,7 +15,7 @@ jobs:
 
     steps:
     - name: Checkout repository
-      uses: actions/checkout@v3
+      uses: actions/checkout@v4
 
     - name: Set up Python
       uses: actions/setup-python@v4
diff --git a/.github/workflows/lint_code.yml b/.github/workflows/lint_code.yml
index 54350df2..2be61def 100644
--- a/.github/workflows/lint_code.yml
+++ b/.github/workflows/lint_code.yml
@@ -14,7 +14,7 @@ jobs:
 
     steps:
     - name: Checkout repository
-      uses: actions/checkout@v3
+      uses: actions/checkout@v4
 
     - name: Set up Python
      uses: actions/setup-python@v4
diff --git a/.github/workflows/unit_test.yml b/.github/workflows/unit_test.yml
new file mode 100644
index 00000000..eb1408c4
--- /dev/null
+++ b/.github/workflows/unit_test.yml
@@ -0,0 +1,94 @@
+name: Run Unit Tests
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
+
+jobs:
+
+  build_wheels:
+    runs-on: ubuntu-24.04
+    container:
+      image: 'nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04'
+    steps:
+      - name: Update GCC
+        run: |
+          export DEBIAN_FRONTEND=noninteractive
+          apt update && apt install -y build-essential gcc-10 g++-10
+      - name: Install Python versions and pips
+        run: |
+          export DEBIAN_FRONTEND=noninteractive
+          apt update && apt install -y software-properties-common curl
+          add-apt-repository ppa:deadsnakes/ppa
+          apt-get install -y python3.10 python3.10-dev python3.10-distutils
+          apt-get install -y python3.11 python3.11-dev python3.11-distutils
+          apt-get install -y python3.12 python3.12-dev python3.12-distutils
+          curl -sS https://bootstrap.pypa.io/get-pip.py | python3.10
+          curl -sS https://bootstrap.pypa.io/get-pip.py | python3.11
+          curl -sS https://bootstrap.pypa.io/get-pip.py | python3.12
+      - name: Checkout code
+        uses: actions/checkout@v4
+      - name: Build wheel with Python 3.10
+        run: |
+          python3.10 -m pip install -U poetry build six
+          python3.10 -m poetry build -f wheel
+      - name: Build wheel with Python 3.11
+        run: |
+          python3.11 -m pip install -U poetry build six
+          python3.11 -m poetry build -f wheel
+      - name: Build wheel with Python 3.12
+        run: |
+          python3.12 -m pip install -U poetry build six
+          python3.12 -m poetry build -f wheel
+      - name: Upload the wheel artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: resiliency-wheels
+          path: dist/*.whl
+
+  unit_tests:
+    runs-on: ubuntu-24.04
+    needs: build_wheels
+    strategy:
+      matrix:
+        container:
+          - 'pytorch/pytorch:2.4.1-cuda12.1-cudnn9-runtime'
+          - 'pytorch/pytorch:2.2.2-cuda12.1-cudnn8-runtime'
+          - 'pytorch/pytorch:2.1.0-cuda11.8-cudnn8-runtime'
+        test_type: ['fault_tolerance', 'straggler', 'ptl_resiliency']
+    container:
+      image: ${{ matrix.container }}
+      env:
+        MKL_SERVICE_FORCE_INTEL: 1 # Fix for "MKL_THREADING_LAYER=INTEL is incompatible with libgomp.so.1 library."
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+      - name: Download wheels
+        uses: actions/download-artifact@v4
+        with:
+          name: resiliency-wheels
+          path: ./dist/
+      - name: Set up environment
+        run: |
+          pip install pytest lightning
+          PY_VER_NODOT=$(python -c"import sysconfig; print(sysconfig.get_config_var('py_version_nodot'))")
+          pip install ./dist/nvidia_resiliency_ext-*-cp${PY_VER_NODOT}-*.whl
+      - name: Run unit tests
+        shell: bash
+        run: |
+          if [[ "${{ matrix.test_type }}" == "straggler" ]]; then
+            pytest -s -vvv -m "not gpu" ./tests/straggler/unit/
+            exit 0
+          elif [[ "${{ matrix.test_type }}" == "ptl_resiliency" ]]; then
+            pytest -s -vvv -m "not gpu" ./tests/ptl_resiliency/unit/
+            exit 0
+          elif [[ "${{ matrix.test_type }}" == "fault_tolerance" ]]; then
+            pytest -s -vvv -m "not gpu" ./tests/fault_tolerance/unit/
+            exit 0
+          else
+            echo "Unknown test type: ${{ matrix.test_type }}"
+            exit 1
+          fi
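
For reference, the "Set up environment" step above derives the CPython tag from sysconfig so that each matrix container installs the wheel built for its own interpreter. A minimal sketch of what that expression evaluates to (the printed value is only an example for Python 3.10):

import sysconfig

# 'py_version_nodot' is e.g. '310' on Python 3.10, so the glob below matches
# the cp310 wheel uploaded by the build_wheels job.
py_ver_nodot = sysconfig.get_config_var("py_version_nodot")
print(f"nvidia_resiliency_ext-*-cp{py_ver_nodot}-*.whl")
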
diff --git a/Dockerfile.builder b/Dockerfile.builder
deleted file mode 100644
index b128b263..00000000
--- a/Dockerfile.builder
+++ /dev/null
@@ -1,43 +0,0 @@
-# This image purpose is to build "nvidia_resiliency_ext" wheels using different Python versions.
-# There are python3.10, python3.11 and python3.12 installed.
-# Base image is CUDA, as Straggler Detection package uses CUPTI.
-# Wheel for Python3.10 can be created with "python3.10 -m build --wheel" etc.
-
-# Choose a base CUDA image from NVIDIA
-# nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04, nvidia/cuda:12.4.1-cudnn-devel-ubuntu20.04 etc.
-ARG BASE_CUDA_IMG=nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04
-FROM ${BASE_CUDA_IMG}
-
-# Set environment variables to non-interactive to avoid prompts during package installation
-ENV DEBIAN_FRONTEND=noninteractive
-
-# Repo with Pythons
-RUN apt update && apt install -y software-properties-common && add-apt-repository ppa:deadsnakes/ppa
-
-# Install common dependencies
-RUN apt-get update && apt-get install -y \
-    python3.10 python3.10-dev python3.10-distutils \
-    python3.11 python3.11-dev python3.11-distutils \
-    python3.12 python3.12-dev python3.12-distutils \
-    wget curl build-essential gcc-10 g++-10 \
-    && rm -rf /var/lib/apt/lists/*
-
-# Install pip for each Python version
-RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.10 && \
-    curl -sS https://bootstrap.pypa.io/get-pip.py | python3.11 && \
-    curl -sS https://bootstrap.pypa.io/get-pip.py | python3.12
-
-# Install deps,
-# FIXME: for some reason six needs to be manually updated
-# otherwise wheel building fails with: ModuleNotFoundError: No module named 'six'
-RUN python3.10 -m pip install build poetry && \
-    python3.11 -m pip install build poetry && \
-    python3.12 -m pip install build poetry && \
-    python3.10 -m pip install -U six && \
-    python3.11 -m pip install -U six && \
-    python3.12 -m pip install -U six
-
-# Set the working directory
-WORKDIR /workspace
-
-ENTRYPOINT ["/bin/bash", "-c"]
diff --git a/pytest.ini b/pytest.ini
new file mode 100644
index 00000000..eb606adf
--- /dev/null
+++ b/pytest.ini
@@ -0,0 +1,3 @@
+[pytest]
+markers =
+    gpu: tests that require GPU
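
The gpu marker registered in pytest.ini is what lets the CPU-only CI containers above run `pytest -m "not gpu"` without warnings; several test modules below opt in via a module-level pytestmark. A minimal sketch of both ways to apply the marker (the test name and module are hypothetical):

import pytest
import torch

# Applies the marker to every test in this (hypothetical) module.
pytestmark = pytest.mark.gpu


@pytest.mark.gpu
def test_requires_cuda_device():
    # Deselected in CI via `pytest -m "not gpu"`; runs only where a GPU is present.
    assert torch.cuda.is_available()

Registering the marker under `markers =` in pytest.ini keeps pytest from warning about an unknown mark (and from failing under --strict-markers).
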
diff --git a/tests/fault_tolerance/unit/test_launcher.py b/tests/fault_tolerance/unit/test_launcher.py
index 6ac842a4..050a7071 100644
--- a/tests/fault_tolerance/unit/test_launcher.py
+++ b/tests/fault_tolerance/unit/test_launcher.py
@@ -47,7 +47,7 @@ def _run_launcher(cmd_to_run, timeout):
         cmd_to_run,
         shell=True,
         stdout=subprocess.PIPE,
-        stderr=subprocess.DEVNULL,
+        stderr=subprocess.PIPE,
         text=True,
     )
     stdout, _ = proc.communicate(timeout=timeout)
@@ -79,13 +79,13 @@ def test_rank_not_send_initial_hb(tmp_dir):
     ft_cfg_path = _save_ft_cfg(ft_cfg, tmp_dir)
     cmd_to_run = f"{_get_util_script_path()} --scenario={_get_func_name()} --which_rank=1"
     launcher_cmd = (
-        "ft_launcher --monitor-interval=1"
+        "PYTHONFAULTHANDLER=1 ft_launcher --monitor-interval=1"
         f" --fault-tol-cfg-path={ft_cfg_path} --nproc-per-node={WORLD_SIZE} {cmd_to_run}"
     )
     ret_code, output = _run_launcher(launcher_cmd, DEFAULT_TIMEOUT)
     assert "ALL RANKS STARTED" in output
     assert "RANK IS SKIPPING INITIAL HB" in output
-    assert ret_code == 1
+    assert ret_code == 1, f"Launcher should return with 1. Ret value={ret_code}. Output=\n{output}"


 def test_rank_failed(tmp_dir):
@@ -98,13 +98,13 @@
     ft_cfg_path = _save_ft_cfg(ft_cfg, tmp_dir)
     cmd_to_run = f"{_get_util_script_path()} --scenario={_get_func_name()} --which_rank=1"
     launcher_cmd = (
-        "ft_launcher --monitor-interval=1"
+        "PYTHONFAULTHANDLER=1 ft_launcher --monitor-interval=1"
         f" --fault-tol-cfg-path={ft_cfg_path} --nproc-per-node={WORLD_SIZE} {cmd_to_run}"
     )
     ret_code, output = _run_launcher(launcher_cmd, DEFAULT_TIMEOUT)
     assert "ALL RANKS STARTED" in output
     assert "RANK FAILED" in output
-    assert ret_code == 1
+    assert ret_code == 1, f"Launcher should return with 1. Ret value={ret_code}. Output=\n{output}"


 def test_ranks_exit_gracefully(tmp_dir):
@@ -116,13 +116,13 @@
     ft_cfg_path = _save_ft_cfg(ft_cfg, tmp_dir)
     cmd_to_run = f"{_get_util_script_path()} --scenario={_get_func_name()}"
     launcher_cmd = (
-        "ft_launcher --monitor-interval=1"
+        "PYTHONFAULTHANDLER=1 ft_launcher --monitor-interval=1"
         f" --fault-tol-cfg-path={ft_cfg_path} --nproc-per-node={WORLD_SIZE} {cmd_to_run}"
     )
     ret_code, output = _run_launcher(launcher_cmd, DEFAULT_TIMEOUT)
     assert "ALL RANKS STARTED" in output
     assert "RANK EXITS GRACEFULLY" in output
-    assert ret_code == 0
+    assert ret_code == 0, f"Launcher should return with 0. Ret value={ret_code}. Output=\n{output}"


 def test_launcher_sigterm_graceful_exit(tmp_dir):
@@ -136,14 +136,14 @@
     ft_cfg_path = _save_ft_cfg(ft_cfg, tmp_dir)
     cmd_to_run = f"{_get_util_script_path()} --scenario={_get_func_name()} --term_handler=return0"
     launcher_cmd = (
-        "ft_launcher --monitor-interval=1"
+        "PYTHONFAULTHANDLER=1 ft_launcher --monitor-interval=1"
         f" --fault-tol-cfg-path={ft_cfg_path} --nproc-per-node={WORLD_SIZE} {cmd_to_run}"
     )
     ret_code, output = _run_launcher(launcher_cmd, DEFAULT_TIMEOUT)
     assert "ALL RANKS STARTED" in output
     assert "SIGTERM SENT TO LAUNCHER" in output
     assert "RANK GOT SIGTERM: RETURN0" in output
-    assert ret_code == 0
+    assert ret_code == 0, f"Launcher should return with 0. Ret value={ret_code}. Output=\n{output}"


 def test_launcher_sigterm_ignored(tmp_dir):
@@ -157,14 +157,14 @@
     ft_cfg_path = _save_ft_cfg(ft_cfg, tmp_dir)
     cmd_to_run = f"{_get_util_script_path()} --scenario={_get_func_name()} --term_handler=ignore"
     launcher_cmd = (
-        "ft_launcher --term-timeout=5 --monitor-interval=1"
+        "PYTHONFAULTHANDLER=1 ft_launcher --term-timeout=5 --monitor-interval=1"
         f" --fault-tol-cfg-path={ft_cfg_path} --nproc-per-node={WORLD_SIZE} {cmd_to_run}"
     )
     ret_code, output = _run_launcher(launcher_cmd, DEFAULT_TIMEOUT)
     assert "ALL RANKS STARTED" in output
     assert "SIGTERM SENT TO LAUNCHER" in output
     assert "RANK GOT SIGTERM: IGNORED" in output
-    assert ret_code == 1
+    assert ret_code == 1, f"Launcher should return with 1. Ret value={ret_code}. Output=\n{output}"


 def test_ranks_restart(tmp_dir):
@@ -178,7 +178,7 @@
     ft_cfg_path = _save_ft_cfg(ft_cfg, tmp_dir)
     cmd_to_run = f"{_get_util_script_path()} --scenario={_get_func_name()} --tmp_dir={tmp_dir}"
     launcher_cmd = (
-        "ft_launcher --max-restarts=2 --monitor-interval=1"
+        "PYTHONFAULTHANDLER=1 ft_launcher --max-restarts=2 --monitor-interval=1"
         f" --fault-tol-cfg-path={ft_cfg_path} --nproc-per-node={WORLD_SIZE} {cmd_to_run}"
     )
     ret_code, output = _run_launcher(launcher_cmd, DEFAULT_TIMEOUT)
@@ -189,7 +189,7 @@
     assert "RANK FAILED" in output
     assert "RESTART #2" in output
     assert "RANK EXITS GRACEFULLY" in output
-    assert ret_code == 0
+    assert ret_code == 0, f"Launcher should return with 0. Ret value={ret_code}. Output=\n{output}"


 def test_missing_cfg(tmp_dir):
@@ -200,40 +200,44 @@
     # By default, launcher should raise an error if FT config cant be read
     cmd_to_run = f"{_get_util_script_path()} --scenario=test_ranks_exit_gracefully"
     launcher_cmd = (
-        "ft_launcher --monitor-interval=1"
+        "PYTHONFAULTHANDLER=1 ft_launcher --monitor-interval=1"
         f" --fault-tol-cfg-path={empty_ft_cfg_path} --nproc-per-node={WORLD_SIZE} {cmd_to_run}"
     )
     ret_code, output = _run_launcher(launcher_cmd, DEFAULT_TIMEOUT)
-    assert ret_code != 0
+    assert (
+        ret_code != 0
+    ), f"Launcher should return with non 0. Ret value={ret_code}. Output=\n{output}"

     # Empty config file again, But this time there are FT args in CLI, so should be fine
     cmd_to_run = f"{_get_util_script_path()} --scenario=test_ranks_exit_gracefully"
     launcher_cmd = (
-        "ft_launcher --monitor-interval=1"
+        "PYTHONFAULTHANDLER=1 ft_launcher --monitor-interval=1"
         f" --fault-tol-cfg-path={empty_ft_cfg_path} --nproc-per-node={WORLD_SIZE} --ft-param-rank_heartbeat_timeout=1.0"
         f" {cmd_to_run}"
     )
     ret_code, output = _run_launcher(launcher_cmd, DEFAULT_TIMEOUT)
-    assert ret_code == 0
+    assert ret_code == 0, f"Launcher should return with 0. Ret value={ret_code}. Output=\n{output}"

     # Empty config file again, launcher run with `--ignore-missing-fault-tol-cfg` should use defaults
     cmd_to_run = f"{_get_util_script_path()} --scenario=test_ranks_exit_gracefully"
     launcher_cmd = (
-        "ft_launcher --monitor-interval=1"
+        "PYTHONFAULTHANDLER=1 ft_launcher --monitor-interval=1"
         f" --fault-tol-cfg-path={empty_ft_cfg_path} --ignore-missing-fault-tol-cfg"
         f" --nproc-per-node={WORLD_SIZE} {cmd_to_run}"
     )
     ret_code, output = _run_launcher(launcher_cmd, DEFAULT_TIMEOUT)
-    assert ret_code == 0
+    assert ret_code == 0, f"Launcher should return with 0. Ret value={ret_code}. Output=\n{output}"

     # Invalid config file path - should fail despite --ignore-missing-fault-tol-cfg and FT args specified via CLI
     cmd_to_run = f"{_get_util_script_path()} --scenario=test_ranks_exit_gracefully"
     launcher_cmd = (
-        "ft_launcher --monitor-interval=1"
+        "PYTHONFAULTHANDLER=1 ft_launcher --monitor-interval=1"
         " --fault-tol-cfg-path=/not/there.yaml"
         " --ft-param-rank_heartbeat_timeout=1.0"
         f" --nproc-per-node={WORLD_SIZE} --ignore-missing-fault-tol-cfg"
         f" {cmd_to_run}"
     )
     ret_code, output = _run_launcher(launcher_cmd, DEFAULT_TIMEOUT)
-    assert ret_code != 0
+    assert (
+        ret_code != 0
+    ), f"Launcher should return with non 0. Ret value={ret_code}. Output=\n{output}"


 def test_config_provided_via_cli(tmp_dir):
@@ -246,9 +250,12 @@
         " --ft-param-log_level=WARNING"
     )
     cmd_to_run = f"{_get_util_script_path()} --scenario=dump_cfg --tmp_dir={tmp_dir}"
-    launcher_cmd = "ft_launcher" f" {ft_params_str} --nproc-per-node={WORLD_SIZE} {cmd_to_run}"
+    launcher_cmd = (
+        "PYTHONFAULTHANDLER=1 ft_launcher"
+        f" {ft_params_str} --nproc-per-node={WORLD_SIZE} {cmd_to_run}"
+    )
     ret_code, output = _run_launcher(launcher_cmd, DEFAULT_TIMEOUT)
-    assert ret_code == 0
+    assert ret_code == 0, f"Launcher should return with 0. Ret value={ret_code}. Output=\n{output}"

     dumped_ft_cfg_path = os.path.join(tmp_dir, "cfg_dump.yaml")
     assert os.path.exists(dumped_ft_cfg_path)
@@ -282,11 +289,11 @@ def test_config_provided_via_cli_overwrites_yaml(tmp_dir):
     )
     cmd_to_run = f"{_get_util_script_path()} --scenario=dump_cfg --tmp_dir={tmp_dir}"
     launcher_cmd = (
-        "ft_launcher"
+        "PYTHONFAULTHANDLER=1 ft_launcher"
         f" {ft_params_str} --fault-tol-cfg-path={ft_cfg_path} --nproc-per-node={WORLD_SIZE} {cmd_to_run}"
     )
     ret_code, output = _run_launcher(launcher_cmd, DEFAULT_TIMEOUT)
-    assert ret_code == 0
+    assert ret_code == 0, f"Launcher should return with 0. Ret value={ret_code}. Output=\n{output}"

     dumped_ft_cfg_path = os.path.join(tmp_dir, "cfg_dump.yaml")
     assert os.path.exists(dumped_ft_cfg_path)
diff --git a/tests/fault_tolerance/unit/test_timeouts_calc.py b/tests/fault_tolerance/unit/test_timeouts_calc.py
index 08f021b3..bb5e9868 100644
--- a/tests/fault_tolerance/unit/test_timeouts_calc.py
+++ b/tests/fault_tolerance/unit/test_timeouts_calc.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import faulthandler
 import shutil
 import sys
 import tempfile
@@ -50,6 +51,7 @@ def test_basic():


 def _rank_main(*args, tmp_dir, **kwargs):
+    faulthandler.enable(file=sys.stderr)
     tc = TimeoutsCalc(start_time=0, safety_factor=2.0)
     rank = dist.get_rank()
     if rank in [1, 2]:
diff --git a/tests/fault_tolerance/unit/test_update_state.py b/tests/fault_tolerance/unit/test_update_state.py
index 85a48194..61071f63 100644
--- a/tests/fault_tolerance/unit/test_update_state.py
+++ b/tests/fault_tolerance/unit/test_update_state.py
@@ -19,8 +19,6 @@

 from nvidia_resiliency_ext.fault_tolerance import dict_utils as ft_utils

-torch.set_default_device("cuda")
-

 def test_merge_state_dicts():
     d1 = {}
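
The PYTHONFAULTHANDLER=1 prefix added to the launcher commands above has the same effect as calling faulthandler.enable() at interpreter startup, so a crashing or hanging rank prints Python tracebacks to stderr; the companion change from stderr=subprocess.DEVNULL to stderr=subprocess.PIPE makes that output available to the test process. A small sketch of the equivalent in-process setup; the watchdog timeout is illustrative and not part of this patch:

import faulthandler
import sys

# Same effect as running the process with PYTHONFAULTHANDLER=1.
faulthandler.enable(file=sys.stderr)

# Illustrative extra: also dump all thread stacks if the process is still
# alive after 60 seconds, which helps diagnose CI hangs.
faulthandler.dump_traceback_later(60, repeat=False, file=sys.stderr, exit=False)
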
diff --git a/tests/fault_tolerance/unit/utils.py b/tests/fault_tolerance/unit/utils.py
index bba16133..542bb948 100644
--- a/tests/fault_tolerance/unit/utils.py
+++ b/tests/fault_tolerance/unit/utils.py
@@ -14,6 +14,7 @@
 # limitations under the License.

 import contextlib
+import gc
 import os
 import socket
 import sys
@@ -113,6 +114,10 @@ def distributed_worker(

     worker_fn(**kwargs)

+    # `destroy_process_group` hangs were observed in CI
+    # use GC collect and barrier to mitigate the issue
+    gc.collect()
+    torch.distributed.barrier()
     torch.distributed.destroy_process_group()
     sys.exit(0)
diff --git a/tests/ptl_resiliency/unit/test_ft_callback.py b/tests/ptl_resiliency/unit/test_ft_callback.py
index 98ae63f3..bce782b7 100644
--- a/tests/ptl_resiliency/unit/test_ft_callback.py
+++ b/tests/ptl_resiliency/unit/test_ft_callback.py
@@ -206,7 +206,7 @@ def _run_trainining(
     trainer = pl.Trainer(
         strategy='ddp',
         devices=1,
-        accelerator='gpu',
+        accelerator='cpu',
         logger=False,
         max_steps=max_steps,
         max_epochs=max_epochs,
@@ -243,7 +243,7 @@ def _run_eval(tmp_path, which='not set'):
     trainer = pl.Trainer(
         strategy='ddp',
         devices=1,
-        accelerator='gpu',
+        accelerator='cpu',
         logger=False,
         callbacks=[fault_tol_cb, checkpoint_callback],
     )
diff --git a/tests/ptl_resiliency/unit/test_straggler_det_callback.py b/tests/ptl_resiliency/unit/test_straggler_det_callback.py
index dfb48862..6cbdba79 100644
--- a/tests/ptl_resiliency/unit/test_straggler_det_callback.py
+++ b/tests/ptl_resiliency/unit/test_straggler_det_callback.py
@@ -26,6 +26,8 @@

 from nvidia_resiliency_ext.ptl_resiliency import StragglerDetectionCallback

+pytestmark = pytest.mark.gpu
+

 class OnesDataset(torch.utils.data.Dataset):
     def __init__(self, dataset_len):
diff --git a/tests/straggler/unit/_utils.py b/tests/straggler/unit/_utils.py
index bba16133..542bb948 100644
--- a/tests/straggler/unit/_utils.py
+++ b/tests/straggler/unit/_utils.py
@@ -14,6 +14,7 @@
 # limitations under the License.

 import contextlib
+import gc
 import os
 import socket
 import sys
@@ -113,6 +114,10 @@ def distributed_worker(

     worker_fn(**kwargs)

+    # `destroy_process_group` hangs were observed in CI
+    # use GC collect and barrier to mitigate the issue
+    gc.collect()
+    torch.distributed.barrier()
     torch.distributed.destroy_process_group()
     sys.exit(0)
diff --git a/tests/straggler/unit/test_cupti_ext.py b/tests/straggler/unit/test_cupti_ext.py
index bc954151..5f4a4dca 100644
--- a/tests/straggler/unit/test_cupti_ext.py
+++ b/tests/straggler/unit/test_cupti_ext.py
@@ -19,6 +19,8 @@

 from nvidia_resiliency_ext.straggler import cupti_module

+pytestmark = pytest.mark.gpu
+

 def test_basic_kernel_tracking():
     cupti_ext = cupti_module.CuptiProfiler()
diff --git a/tests/straggler/unit/test_cupti_manager.py b/tests/straggler/unit/test_cupti_manager.py
index e94c2427..5d88f81c 100644
--- a/tests/straggler/unit/test_cupti_manager.py
+++ b/tests/straggler/unit/test_cupti_manager.py
@@ -18,6 +18,8 @@

 from nvidia_resiliency_ext.straggler.cupti import CuptiManager

+pytestmark = pytest.mark.gpu
+

 def test_cupti_manager_start_stop():
     cupti_mgr = CuptiManager()
diff --git a/tests/straggler/unit/test_det_section_api.py b/tests/straggler/unit/test_det_section_api.py
index cf876f89..123763c5 100644
--- a/tests/straggler/unit/test_det_section_api.py
+++ b/tests/straggler/unit/test_det_section_api.py
@@ -20,6 +20,8 @@

 from nvidia_resiliency_ext import straggler

+pytestmark = pytest.mark.gpu
+

 @pytest.fixture
 def _straggler_init_shutdown():
diff --git a/tests/straggler/unit/test_interval_tracker.py b/tests/straggler/unit/test_interval_tracker.py
index 5e18403c..14cdbda1 100644
--- a/tests/straggler/unit/test_interval_tracker.py
+++ b/tests/straggler/unit/test_interval_tracker.py
@@ -15,8 +15,13 @@

 import time

+import pytest
+
 from nvidia_resiliency_ext.straggler import interval_tracker

+# FIXME: should work without GPU as well
+pytestmark = pytest.mark.gpu
+

 def test_estimate():
diff --git a/tests/straggler/unit/test_reporting.py b/tests/straggler/unit/test_reporting.py
index 00e3c3a6..3efbd0f1 100644
--- a/tests/straggler/unit/test_reporting.py
+++ b/tests/straggler/unit/test_reporting.py
@@ -25,6 +25,8 @@

 from nvidia_resiliency_ext import straggler

+pytestmark = pytest.mark.gpu
+

 class Layer(nn.Module):
     def __init__(self, in_features, out_features, bias):
diff --git a/tests/straggler/unit/test_reporting_elapsed.py b/tests/straggler/unit/test_reporting_elapsed.py
index 6827910d..c3a92c94 100644
--- a/tests/straggler/unit/test_reporting_elapsed.py
+++ b/tests/straggler/unit/test_reporting_elapsed.py
@@ -24,6 +24,8 @@

 from nvidia_resiliency_ext import straggler

+pytestmark = pytest.mark.gpu
+

 class Layer(nn.Module):
     def __init__(self, in_features, out_features, bias):
diff --git a/tests/straggler/unit/test_sections.py b/tests/straggler/unit/test_sections.py
index 63c2e3ee..842a2fa5 100644
--- a/tests/straggler/unit/test_sections.py
+++ b/tests/straggler/unit/test_sections.py
@@ -24,6 +24,9 @@

 from ._utils import multiprocessing_execute_join, multiprocessing_execute_start

+# FIXME: should work without GPU as well
+pytestmark = pytest.mark.gpu
+
 # This is a basic test of custom sections:
 # - Run distributed dummy workload, where each rank has 3 custom sections
 # - Each rank executes the sections in a loop
diff --git a/tests/straggler/unit/test_wrap_callables.py b/tests/straggler/unit/test_wrap_callables.py
index 3025d3b2..ff75297f 100644
--- a/tests/straggler/unit/test_wrap_callables.py
+++ b/tests/straggler/unit/test_wrap_callables.py
@@ -24,6 +24,8 @@

 from ._utils import multiprocessing_execute_join, multiprocessing_execute_start

+pytestmark = pytest.mark.gpu
+
 TEST_WORLD_SIZE = 4
 ALL_RANK_IDS = set(range(TEST_WORLD_SIZE))
 RANK_DONE_TIMEOUT = 30
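
For completeness, the teardown change applied to both distributed_worker helpers above follows the pattern sketched below; the single-rank gloo setup exists only to make the sketch self-contained and is not part of the patch:

import gc
import os

import torch.distributed as dist


def shutdown_worker():
    # `destroy_process_group` hangs were observed in CI; forcing a GC pass and
    # synchronizing all ranks first mitigates the issue.
    gc.collect()
    dist.barrier()
    dist.destroy_process_group()


if __name__ == "__main__":
    # Illustrative single-rank "world" so the sketch runs without a launcher.
    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29500")
    dist.init_process_group(backend="gloo", rank=0, world_size=1)
    shutdown_worker()
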