Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
f8e8892
Make FT unit tests CPU only
jbieniusiewi Oct 15, 2024
b619dd8
Made FT callback unit tests CPU compatible
jbieniusiewi Oct 15, 2024
e701ddf
Mark GPU-only straggler tests
jbieniusiewi Oct 15, 2024
9323e1c
applied pre-commit hooks
jbieniusiewi Oct 15, 2024
0cf339e
Mark straggler PTL callback tests as GPU
jbieniusiewi Oct 15, 2024
c8c6087
Added unit test workflow
jbieniusiewi Oct 15, 2024
3a6a969
Working on unit tests workflow YAML, wip
jbieniusiewi Oct 15, 2024
cd186c4
Removed not used Dockerfile.builder
jbieniusiewi Oct 15, 2024
0c5f5c9
Updated GH actions
jbieniusiewi Oct 15, 2024
cc18e83
More work on unit test YAML ...
jbieniusiewi Oct 15, 2024
ae07fe5
More PyTorch images for testing.
jbieniusiewi Oct 15, 2024
39f257d
Restored interval_tracker.py
jbieniusiewi Oct 15, 2024
289d637
commented out older pyt
jbieniusiewi Oct 15, 2024
3f7a0c8
shows launcher test stderr
jbieniusiewi Oct 15, 2024
f65878d
More logs in the launcher test
jbieniusiewi Oct 15, 2024
67220a2
Restored older PyTorches
jbieniusiewi Oct 15, 2024
d5365dc
Added faulthandler for debugging
jbieniusiewi Oct 15, 2024
6b4650d
cleaner launcher test logs
jbieniusiewi Oct 15, 2024
f30c73e
timeouts bump
jbieniusiewi Oct 15, 2024
761aec6
removed debug code from yaml
jbieniusiewi Oct 15, 2024
fe3b84b
timeouts bump #2
jbieniusiewi Oct 15, 2024
3c739a2
split unit tests into separate subtasks
srogawski-nvidia Oct 16, 2024
3754bd9
remove yaml anchors - not supported
srogawski-nvidia Oct 16, 2024
8b064d7
define bash as default shell for tests
srogawski-nvidia Oct 16, 2024
5ba0eca
add reruns, stop after first fail, and parallel runs
srogawski-nvidia Oct 16, 2024
90fb138
remove parallelism
srogawski-nvidia Oct 16, 2024
7d9ae26
Add an explicit exit to ensure termination
srogawski-nvidia Oct 16, 2024
20525df
change order of tests
srogawski-nvidia Oct 16, 2024
38c2ccb
Add dbg trigger
jbieniusiewi Oct 16, 2024
4941376
Throwaway dbg commit
jbieniusiewi Oct 16, 2024
da07642
version with fixed dist group destroy
jbieniusiewi Oct 16, 2024
bef59a1
select all straggler tests
jbieniusiewi Oct 16, 2024
a4574b8
merged dist group shutdown fix
jbieniusiewi Oct 16, 2024
d0d3fbd
restored timeouts
jbieniusiewi Oct 16, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/build_docs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ jobs:

steps:
- name: Checkout repository
uses: actions/checkout@v3
uses: actions/checkout@v4

- name: Set up Python
uses: actions/setup-python@v4
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/lint_code.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ jobs:

steps:
- name: Checkout repository
uses: actions/checkout@v3
uses: actions/checkout@v4

- name: Set up Python
uses: actions/setup-python@v4
Expand Down
94 changes: 94 additions & 0 deletions .github/workflows/unit_test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
name: Run Unit Tests

on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main

jobs:

  # Build wheels for every supported CPython inside a CUDA devel image,
  # then publish them as a single artifact consumed by the test matrix.
  build_wheels:
    runs-on: ubuntu-24.04
    container:
      image: 'nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04'
    steps:
      - name: Update GCC
        run: |
          export DEBIAN_FRONTEND=noninteractive
          apt update && apt install -y build-essential gcc-10 g++-10
      - name: Install Python versions and pips
        run: |
          export DEBIAN_FRONTEND=noninteractive
          apt update && apt install -y software-properties-common curl
          # deadsnakes provides 3.10/3.11/3.12 builds for Ubuntu 20.04
          add-apt-repository ppa:deadsnakes/ppa
          apt-get install -y python3.10 python3.10-dev python3.10-distutils
          apt-get install -y python3.11 python3.11-dev python3.11-distutils
          apt-get install -y python3.12 python3.12-dev python3.12-distutils
          # deadsnakes pythons ship without pip; bootstrap it per interpreter
          curl -sS https://bootstrap.pypa.io/get-pip.py | python3.10
          curl -sS https://bootstrap.pypa.io/get-pip.py | python3.11
          curl -sS https://bootstrap.pypa.io/get-pip.py | python3.12
      - name: Checkout code
        uses: actions/checkout@v4
      - name: Build wheel with Python 3.10
        run: |
          python3.10 -m pip install -U poetry build six
          python3.10 -m poetry build -f wheel
      - name: Build wheel with Python 3.11
        run: |
          python3.11 -m pip install -U poetry build six
          python3.11 -m poetry build -f wheel
      - name: Build wheel with Python 3.12
        run: |
          python3.12 -m pip install -U poetry build six
          python3.12 -m poetry build -f wheel
      - name: Upload the wheel artifact
        uses: actions/upload-artifact@v4
        with:
          name: resiliency-wheels
          path: dist/*.whl

  # Run the CPU-only unit tests (tests marked "gpu" are excluded) across
  # a matrix of PyTorch runtime images and test suites.
  unit_tests:
    runs-on: ubuntu-24.04
    needs: build_wheels
    strategy:
      matrix:
        container:
          - 'pytorch/pytorch:2.4.1-cuda12.1-cudnn9-runtime'
          - 'pytorch/pytorch:2.2.2-cuda12.1-cudnn8-runtime'
          - 'pytorch/pytorch:2.1.0-cuda11.8-cudnn8-runtime'
        test_type: ['fault_tolerance', 'straggler', 'ptl_resiliency']
    container:
      image: ${{ matrix.container }}
      env:
        # Quoted so the value stays a string; fix for "MKL_THREADING_LAYER=INTEL
        # is incompatible with libgomp.so.1 library."
        MKL_SERVICE_FORCE_INTEL: "1"
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
      - name: Download wheels
        uses: actions/download-artifact@v4
        with:
          name: resiliency-wheels
          path: ./dist/
      - name: Set up environment
        run: |
          pip install pytest lightning
          # Pick the wheel matching this container's Python (e.g. cp310)
          PY_VER_NODOT=$(python -c"import sysconfig; print(sysconfig.get_config_var('py_version_nodot'))")
          pip install ./dist/nvidia_resiliency_ext-*-cp${PY_VER_NODOT}-*.whl
      - name: Run unit tests
        shell: bash
        # shell: bash runs with -e, so a pytest failure aborts the step before
        # the explicit `exit 0`; the exits ensure prompt, unambiguous termination.
        run: |
          if [[ "${{ matrix.test_type }}" == "straggler" ]]; then
            pytest -s -vvv -m "not gpu" ./tests/straggler/unit/
            exit 0
          elif [[ "${{ matrix.test_type }}" == "ptl_resiliency" ]]; then
            pytest -s -vvv -m "not gpu" ./tests/ptl_resiliency/unit/
            exit 0
          elif [[ "${{ matrix.test_type }}" == "fault_tolerance" ]]; then
            pytest -s -vvv -m "not gpu" ./tests/fault_tolerance/unit/
            exit 0
          else
            echo "Unknown test type: ${{ matrix.test_type }}"
            exit 1
          fi
43 changes: 0 additions & 43 deletions Dockerfile.builder

This file was deleted.

3 changes: 3 additions & 0 deletions pytest.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[pytest]
# Custom markers; "gpu" lets CI deselect GPU-requiring tests via -m "not gpu"
markers =
    gpu: tests that require GPU
57 changes: 32 additions & 25 deletions tests/fault_tolerance/unit/test_launcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def _run_launcher(cmd_to_run, timeout):
cmd_to_run,
shell=True,
stdout=subprocess.PIPE,
stderr=subprocess.DEVNULL,
stderr=subprocess.PIPE,
text=True,
)
stdout, _ = proc.communicate(timeout=timeout)
Expand Down Expand Up @@ -79,13 +79,13 @@ def test_rank_not_send_initial_hb(tmp_dir):
ft_cfg_path = _save_ft_cfg(ft_cfg, tmp_dir)
cmd_to_run = f"{_get_util_script_path()} --scenario={_get_func_name()} --which_rank=1"
launcher_cmd = (
"ft_launcher --monitor-interval=1"
"PYTHONFAULTHANDLER=1 ft_launcher --monitor-interval=1"
f" --fault-tol-cfg-path={ft_cfg_path} --nproc-per-node={WORLD_SIZE} {cmd_to_run}"
)
ret_code, output = _run_launcher(launcher_cmd, DEFAULT_TIMEOUT)
assert "ALL RANKS STARTED" in output
assert "RANK IS SKIPPING INITIAL HB" in output
assert ret_code == 1
assert ret_code == 1, f"Launcher should return with 1. Ret value={ret_code}. Output=\n{output}"


def test_rank_failed(tmp_dir):
Expand All @@ -98,13 +98,13 @@ def test_rank_failed(tmp_dir):
ft_cfg_path = _save_ft_cfg(ft_cfg, tmp_dir)
cmd_to_run = f"{_get_util_script_path()} --scenario={_get_func_name()} --which_rank=1"
launcher_cmd = (
"ft_launcher --monitor-interval=1"
"PYTHONFAULTHANDLER=1 ft_launcher --monitor-interval=1"
f" --fault-tol-cfg-path={ft_cfg_path} --nproc-per-node={WORLD_SIZE} {cmd_to_run}"
)
ret_code, output = _run_launcher(launcher_cmd, DEFAULT_TIMEOUT)
assert "ALL RANKS STARTED" in output
assert "RANK FAILED" in output
assert ret_code == 1
assert ret_code == 1, f"Launcher should return with 1. Ret value={ret_code}. Output=\n{output}"


def test_ranks_exit_gracefully(tmp_dir):
Expand All @@ -116,13 +116,13 @@ def test_ranks_exit_gracefully(tmp_dir):
ft_cfg_path = _save_ft_cfg(ft_cfg, tmp_dir)
cmd_to_run = f"{_get_util_script_path()} --scenario={_get_func_name()}"
launcher_cmd = (
"ft_launcher --monitor-interval=1"
"PYTHONFAULTHANDLER=1 ft_launcher --monitor-interval=1"
f" --fault-tol-cfg-path={ft_cfg_path} --nproc-per-node={WORLD_SIZE} {cmd_to_run}"
)
ret_code, output = _run_launcher(launcher_cmd, DEFAULT_TIMEOUT)
assert "ALL RANKS STARTED" in output
assert "RANK EXITS GRACEFULLY" in output
assert ret_code == 0
assert ret_code == 0, f"Launcher should return with 0. Ret value={ret_code}. Output=\n{output}"


def test_launcher_sigterm_graceful_exit(tmp_dir):
Expand All @@ -136,14 +136,14 @@ def test_launcher_sigterm_graceful_exit(tmp_dir):
ft_cfg_path = _save_ft_cfg(ft_cfg, tmp_dir)
cmd_to_run = f"{_get_util_script_path()} --scenario={_get_func_name()} --term_handler=return0"
launcher_cmd = (
"ft_launcher --monitor-interval=1"
"PYTHONFAULTHANDLER=1 ft_launcher --monitor-interval=1"
f" --fault-tol-cfg-path={ft_cfg_path} --nproc-per-node={WORLD_SIZE} {cmd_to_run}"
)
ret_code, output = _run_launcher(launcher_cmd, DEFAULT_TIMEOUT)
assert "ALL RANKS STARTED" in output
assert "SIGTERM SENT TO LAUNCHER" in output
assert "RANK GOT SIGTERM: RETURN0" in output
assert ret_code == 0
assert ret_code == 0, f"Launcher should return with 0. Ret value={ret_code}. Output=\n{output}"


def test_launcher_sigterm_ignored(tmp_dir):
Expand All @@ -157,14 +157,14 @@ def test_launcher_sigterm_ignored(tmp_dir):
ft_cfg_path = _save_ft_cfg(ft_cfg, tmp_dir)
cmd_to_run = f"{_get_util_script_path()} --scenario={_get_func_name()} --term_handler=ignore"
launcher_cmd = (
"ft_launcher --term-timeout=5 --monitor-interval=1"
"PYTHONFAULTHANDLER=1 ft_launcher --term-timeout=5 --monitor-interval=1"
f" --fault-tol-cfg-path={ft_cfg_path} --nproc-per-node={WORLD_SIZE} {cmd_to_run}"
)
ret_code, output = _run_launcher(launcher_cmd, DEFAULT_TIMEOUT)
assert "ALL RANKS STARTED" in output
assert "SIGTERM SENT TO LAUNCHER" in output
assert "RANK GOT SIGTERM: IGNORED" in output
assert ret_code == 1
assert ret_code == 1, f"Launcher should return with 1. Ret value={ret_code}. Output=\n{output}"


def test_ranks_restart(tmp_dir):
Expand All @@ -178,7 +178,7 @@ def test_ranks_restart(tmp_dir):
ft_cfg_path = _save_ft_cfg(ft_cfg, tmp_dir)
cmd_to_run = f"{_get_util_script_path()} --scenario={_get_func_name()} --tmp_dir={tmp_dir}"
launcher_cmd = (
"ft_launcher --max-restarts=2 --monitor-interval=1"
"PYTHONFAULTHANDLER=1 ft_launcher --max-restarts=2 --monitor-interval=1"
f" --fault-tol-cfg-path={ft_cfg_path} --nproc-per-node={WORLD_SIZE} {cmd_to_run}"
)
ret_code, output = _run_launcher(launcher_cmd, DEFAULT_TIMEOUT)
Expand All @@ -189,7 +189,7 @@ def test_ranks_restart(tmp_dir):
assert "RANK FAILED" in output
assert "RESTART #2" in output
assert "RANK EXITS GRACEFULLY" in output
assert ret_code == 0
assert ret_code == 0, f"Launcher should return with 0. Ret value={ret_code}. Output=\n{output}"


def test_missing_cfg(tmp_dir):
Expand All @@ -200,40 +200,44 @@ def test_missing_cfg(tmp_dir):
# By default, launcher should raise an error if FT config cant be read
cmd_to_run = f"{_get_util_script_path()} --scenario=test_ranks_exit_gracefully"
launcher_cmd = (
"ft_launcher --monitor-interval=1"
"PYTHONFAULTHANDLER=1 ft_launcher --monitor-interval=1"
f" --fault-tol-cfg-path={empty_ft_cfg_path} --nproc-per-node={WORLD_SIZE} {cmd_to_run}"
)
ret_code, output = _run_launcher(launcher_cmd, DEFAULT_TIMEOUT)
assert ret_code != 0
assert (
ret_code != 0
), f"Launcher should return with non 0. Ret value={ret_code}. Output=\n{output}"
# Empty config file again, But this time there are FT args in CLI, so should be fine
cmd_to_run = f"{_get_util_script_path()} --scenario=test_ranks_exit_gracefully"
launcher_cmd = (
"ft_launcher --monitor-interval=1"
"PYTHONFAULTHANDLER=1 ft_launcher --monitor-interval=1"
f" --fault-tol-cfg-path={empty_ft_cfg_path} --nproc-per-node={WORLD_SIZE} --ft-param-rank_heartbeat_timeout=1.0"
f" {cmd_to_run}"
)
ret_code, output = _run_launcher(launcher_cmd, DEFAULT_TIMEOUT)
assert ret_code == 0
assert ret_code == 0, f"Launcher should return with 0. Ret value={ret_code}. Output=\n{output}"
# Empty config file again, launcher run with `--ignore-missing-fault-tol-cfg` should use defaults
cmd_to_run = f"{_get_util_script_path()} --scenario=test_ranks_exit_gracefully"
launcher_cmd = (
"ft_launcher --monitor-interval=1"
"PYTHONFAULTHANDLER=1 ft_launcher --monitor-interval=1"
f" --fault-tol-cfg-path={empty_ft_cfg_path} --ignore-missing-fault-tol-cfg"
f" --nproc-per-node={WORLD_SIZE} {cmd_to_run}"
)
ret_code, output = _run_launcher(launcher_cmd, DEFAULT_TIMEOUT)
assert ret_code == 0
assert ret_code == 0, f"Launcher should return with 0. Ret value={ret_code}. Output=\n{output}"
# Invalid config file path - should fail despite --ignore-missing-fault-tol-cfg and FT args specified via CLI
cmd_to_run = f"{_get_util_script_path()} --scenario=test_ranks_exit_gracefully"
launcher_cmd = (
"ft_launcher --monitor-interval=1"
"PYTHONFAULTHANDLER=1 ft_launcher --monitor-interval=1"
" --fault-tol-cfg-path=/not/there.yaml"
" --ft-param-rank_heartbeat_timeout=1.0"
f" --nproc-per-node={WORLD_SIZE} --ignore-missing-fault-tol-cfg"
f" {cmd_to_run}"
)
ret_code, output = _run_launcher(launcher_cmd, DEFAULT_TIMEOUT)
assert ret_code != 0
assert (
ret_code != 0
), f"Launcher should return with non 0. Ret value={ret_code}. Output=\n{output}"


def test_config_provided_via_cli(tmp_dir):
Expand All @@ -246,9 +250,12 @@ def test_config_provided_via_cli(tmp_dir):
" --ft-param-log_level=WARNING"
)
cmd_to_run = f"{_get_util_script_path()} --scenario=dump_cfg --tmp_dir={tmp_dir}"
launcher_cmd = "ft_launcher" f" {ft_params_str} --nproc-per-node={WORLD_SIZE} {cmd_to_run}"
launcher_cmd = (
"PYTHONFAULTHANDLER=1 ft_launcher"
f" {ft_params_str} --nproc-per-node={WORLD_SIZE} {cmd_to_run}"
)
ret_code, output = _run_launcher(launcher_cmd, DEFAULT_TIMEOUT)
assert ret_code == 0
assert ret_code == 0, f"Launcher should return with 0. Ret value={ret_code}. Output=\n{output}"

dumped_ft_cfg_path = os.path.join(tmp_dir, "cfg_dump.yaml")
assert os.path.exists(dumped_ft_cfg_path)
Expand Down Expand Up @@ -282,11 +289,11 @@ def test_config_provided_via_cli_overwrites_yaml(tmp_dir):
)
cmd_to_run = f"{_get_util_script_path()} --scenario=dump_cfg --tmp_dir={tmp_dir}"
launcher_cmd = (
"ft_launcher"
"PYTHONFAULTHANDLER=1 ft_launcher"
f" {ft_params_str} --fault-tol-cfg-path={ft_cfg_path} --nproc-per-node={WORLD_SIZE} {cmd_to_run}"
)
ret_code, output = _run_launcher(launcher_cmd, DEFAULT_TIMEOUT)
assert ret_code == 0
assert ret_code == 0, f"Launcher should return with 0. Ret value={ret_code}. Output=\n{output}"

dumped_ft_cfg_path = os.path.join(tmp_dir, "cfg_dump.yaml")
assert os.path.exists(dumped_ft_cfg_path)
Expand Down
2 changes: 2 additions & 0 deletions tests/fault_tolerance/unit/test_timeouts_calc.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import faulthandler
import shutil
import sys
import tempfile
Expand Down Expand Up @@ -50,6 +51,7 @@ def test_basic():


def _rank_main(*args, tmp_dir, **kwargs):
faulthandler.enable(file=sys.stderr)
tc = TimeoutsCalc(start_time=0, safety_factor=2.0)
rank = dist.get_rank()
if rank in [1, 2]:
Expand Down
2 changes: 0 additions & 2 deletions tests/fault_tolerance/unit/test_update_state.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,6 @@

from nvidia_resiliency_ext.fault_tolerance import dict_utils as ft_utils

torch.set_default_device("cuda")


def test_merge_state_dicts():
d1 = {}
Expand Down
5 changes: 5 additions & 0 deletions tests/fault_tolerance/unit/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
# limitations under the License.

import contextlib
import gc
import os
import socket
import sys
Expand Down Expand Up @@ -113,6 +114,10 @@ def distributed_worker(

worker_fn(**kwargs)

# `destroy_process_group` hangs were observed in CI
# use GC collect and barrier to mitigate the issue
gc.collect()
torch.distributed.barrier()
torch.distributed.destroy_process_group()

sys.exit(0)
Expand Down
Loading
Loading