Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
f8e8892
Make FT unit tests CPU only
jbieniusiewi Oct 15, 2024
b619dd8
Made FT callback unit tests CPU compatible
jbieniusiewi Oct 15, 2024
e701ddf
Mark GPU-only straggler tests
jbieniusiewi Oct 15, 2024
9323e1c
applied pre-commit hooks
jbieniusiewi Oct 15, 2024
0cf339e
Mark straggler PTL callback tests as GPU
jbieniusiewi Oct 15, 2024
c8c6087
Added unit test workflow
jbieniusiewi Oct 15, 2024
3a6a969
Working on unit tests workflow YAML, wip
jbieniusiewi Oct 15, 2024
cd186c4
Removed not used Dockerfile.builder
jbieniusiewi Oct 15, 2024
0c5f5c9
Updated GH actions
jbieniusiewi Oct 15, 2024
cc18e83
More work on unit test YAML ...
jbieniusiewi Oct 15, 2024
ae07fe5
More PyTorch images for testing.
jbieniusiewi Oct 15, 2024
39f257d
Restored interval_tracker.py
jbieniusiewi Oct 15, 2024
289d637
commented out older pyt
jbieniusiewi Oct 15, 2024
3f7a0c8
shows launcher test stderr
jbieniusiewi Oct 15, 2024
f65878d
More logs in the launcher test
jbieniusiewi Oct 15, 2024
67220a2
Restored older PyTorches
jbieniusiewi Oct 15, 2024
d5365dc
Added faulthandler for debugging
jbieniusiewi Oct 15, 2024
6b4650d
cleaner launcher test logs
jbieniusiewi Oct 15, 2024
f30c73e
timeouts bump
jbieniusiewi Oct 15, 2024
761aec6
removed debug code from yaml
jbieniusiewi Oct 15, 2024
fe3b84b
timeouts bump #2
jbieniusiewi Oct 15, 2024
3c739a2
split unit tests into separate subtasks
srogawski-nvidia Oct 16, 2024
3754bd9
remove yaml anchors - not supported
srogawski-nvidia Oct 16, 2024
8b064d7
define bash as default shell for tests
srogawski-nvidia Oct 16, 2024
5ba0eca
add reruns, stop after first fail, and parallel runs
srogawski-nvidia Oct 16, 2024
90fb138
remove parallelism
srogawski-nvidia Oct 16, 2024
7d9ae26
Add an explicit exit to ensure termination
srogawski-nvidia Oct 16, 2024
20525df
change order of tests
srogawski-nvidia Oct 16, 2024
38c2ccb
Add dbg trigger
jbieniusiewi Oct 16, 2024
4941376
Throwaway dbg commit
jbieniusiewi Oct 16, 2024
da07642
version with fixed dist group destroy
jbieniusiewi Oct 16, 2024
bef59a1
select all straggler tests
jbieniusiewi Oct 16, 2024
a4574b8
merged dist group shutdown fix
jbieniusiewi Oct 16, 2024
d0d3fbd
restored timeouts
jbieniusiewi Oct 16, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/build_docs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ jobs:

steps:
- name: Checkout repository
uses: actions/checkout@v3
uses: actions/checkout@v4

- name: Set up Python
uses: actions/setup-python@v4
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/lint_code.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ jobs:

steps:
- name: Checkout repository
uses: actions/checkout@v3
uses: actions/checkout@v4

- name: Set up Python
uses: actions/setup-python@v4
Expand Down
94 changes: 94 additions & 0 deletions .github/workflows/unit_test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
name: Run Unit Tests

on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main

jobs:

  # Build wheels for every supported CPython inside a CUDA devel image,
  # then publish them as a single artifact consumed by the test matrix.
  build_wheels:
    runs-on: ubuntu-24.04
    container:
      image: 'nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04'
    steps:
      - name: Update GCC
        run: |
          export DEBIAN_FRONTEND=noninteractive
          apt update && apt install -y build-essential gcc-10 g++-10
      - name: Install Python versions and pips
        run: |
          export DEBIAN_FRONTEND=noninteractive
          apt update && apt install -y software-properties-common curl
          # deadsnakes provides 3.10/3.11/3.12 builds for Ubuntu 20.04
          add-apt-repository ppa:deadsnakes/ppa
          apt-get install -y python3.10 python3.10-dev python3.10-distutils
          apt-get install -y python3.11 python3.11-dev python3.11-distutils
          apt-get install -y python3.12 python3.12-dev python3.12-distutils
          # deadsnakes pythons ship without pip; bootstrap it per interpreter
          curl -sS https://bootstrap.pypa.io/get-pip.py | python3.10
          curl -sS https://bootstrap.pypa.io/get-pip.py | python3.11
          curl -sS https://bootstrap.pypa.io/get-pip.py | python3.12
      - name: Checkout code
        uses: actions/checkout@v4
      - name: Build wheel with Python 3.10
        run: |
          python3.10 -m pip install -U poetry build six
          python3.10 -m poetry build -f wheel
      - name: Build wheel with Python 3.11
        run: |
          python3.11 -m pip install -U poetry build six
          python3.11 -m poetry build -f wheel
      - name: Build wheel with Python 3.12
        run: |
          python3.12 -m pip install -U poetry build six
          python3.12 -m poetry build -f wheel
      - name: Upload the wheel artifact
        uses: actions/upload-artifact@v4
        with:
          name: resiliency-wheels
          path: dist/*.whl

  # Run the CPU-only unit tests (tests marked "gpu" are excluded) across
  # a matrix of PyTorch runtime images and test suites.
  unit_tests:
    runs-on: ubuntu-24.04
    needs: build_wheels
    strategy:
      matrix:
        container:
          - 'pytorch/pytorch:2.4.1-cuda12.1-cudnn9-runtime'
          - 'pytorch/pytorch:2.2.2-cuda12.1-cudnn8-runtime'
          - 'pytorch/pytorch:2.1.0-cuda11.8-cudnn8-runtime'
        test_type: ['fault_tolerance', 'straggler', 'ptl_resiliency']
    container:
      image: ${{ matrix.container }}
      env:
        # Quoted so the value stays a string; fix for "MKL_THREADING_LAYER=INTEL
        # is incompatible with libgomp.so.1 library."
        MKL_SERVICE_FORCE_INTEL: "1"
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
      - name: Download wheels
        uses: actions/download-artifact@v4
        with:
          name: resiliency-wheels
          path: ./dist/
      - name: Set up environment
        run: |
          pip install pytest lightning
          # Pick the wheel matching this container's Python (e.g. cp310)
          PY_VER_NODOT=$(python -c"import sysconfig; print(sysconfig.get_config_var('py_version_nodot'))")
          pip install ./dist/nvidia_resiliency_ext-*-cp${PY_VER_NODOT}-*.whl
      - name: Run unit tests
        shell: bash
        # shell: bash runs with -e, so a pytest failure aborts the step before
        # the explicit `exit 0`; the exits ensure prompt, unambiguous termination.
        run: |
          if [[ "${{ matrix.test_type }}" == "straggler" ]]; then
            pytest -s -vvv -m "not gpu" ./tests/straggler/unit/
            exit 0
          elif [[ "${{ matrix.test_type }}" == "ptl_resiliency" ]]; then
            pytest -s -vvv -m "not gpu" ./tests/ptl_resiliency/unit/
            exit 0
          elif [[ "${{ matrix.test_type }}" == "fault_tolerance" ]]; then
            pytest -s -vvv -m "not gpu" ./tests/fault_tolerance/unit/
            exit 0
          else
            echo "Unknown test type: ${{ matrix.test_type }}"
            exit 1
          fi
43 changes: 0 additions & 43 deletions Dockerfile.builder

This file was deleted.

3 changes: 3 additions & 0 deletions pytest.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[pytest]
# Custom markers; "gpu" lets CI deselect GPU-requiring tests via -m "not gpu"
markers =
    gpu: tests that require GPU
57 changes: 32 additions & 25 deletions tests/fault_tolerance/unit/test_launcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def _run_launcher(cmd_to_run, timeout):
cmd_to_run,
shell=True,
stdout=subprocess.PIPE,
stderr=subprocess.DEVNULL,
stderr=subprocess.PIPE,
text=True,
)
stdout, _ = proc.communicate(timeout=timeout)
Expand Down Expand Up @@ -79,13 +79,13 @@ def test_rank_not_send_initial_hb(tmp_dir):
ft_cfg_path = _save_ft_cfg(ft_cfg, tmp_dir)
cmd_to_run = f"{_get_util_script_path()} --scenario={_get_func_name()} --which_rank=1"
launcher_cmd = (
"ft_launcher --monitor-interval=1"
"PYTHONFAULTHANDLER=1 ft_launcher --monitor-interval=1"
f" --fault-tol-cfg-path={ft_cfg_path} --nproc-per-node={WORLD_SIZE} {cmd_to_run}"
)
ret_code, output = _run_launcher(launcher_cmd, DEFAULT_TIMEOUT)
assert "ALL RANKS STARTED" in output
assert "RANK IS SKIPPING INITIAL HB" in output
assert ret_code == 1
assert ret_code == 1, f"Launcher should return with 1. Ret value={ret_code}. Output=\n{output}"


def test_rank_failed(tmp_dir):
Expand All @@ -98,13 +98,13 @@ def test_rank_failed(tmp_dir):
ft_cfg_path = _save_ft_cfg(ft_cfg, tmp_dir)
cmd_to_run = f"{_get_util_script_path()} --scenario={_get_func_name()} --which_rank=1"
launcher_cmd = (
"ft_launcher --monitor-interval=1"
"PYTHONFAULTHANDLER=1 ft_launcher --monitor-interval=1"
f" --fault-tol-cfg-path={ft_cfg_path} --nproc-per-node={WORLD_SIZE} {cmd_to_run}"
)
ret_code, output = _run_launcher(launcher_cmd, DEFAULT_TIMEOUT)
assert "ALL RANKS STARTED" in output
assert "RANK FAILED" in output
assert ret_code == 1
assert ret_code == 1, f"Launcher should return with 1. Ret value={ret_code}. Output=\n{output}"


def test_ranks_exit_gracefully(tmp_dir):
Expand All @@ -116,13 +116,13 @@ def test_ranks_exit_gracefully(tmp_dir):
ft_cfg_path = _save_ft_cfg(ft_cfg, tmp_dir)
cmd_to_run = f"{_get_util_script_path()} --scenario={_get_func_name()}"
launcher_cmd = (
"ft_launcher --monitor-interval=1"
"PYTHONFAULTHANDLER=1 ft_launcher --monitor-interval=1"
f" --fault-tol-cfg-path={ft_cfg_path} --nproc-per-node={WORLD_SIZE} {cmd_to_run}"
)
ret_code, output = _run_launcher(launcher_cmd, DEFAULT_TIMEOUT)
assert "ALL RANKS STARTED" in output
assert "RANK EXITS GRACEFULLY" in output
assert ret_code == 0
assert ret_code == 0, f"Launcher should return with 0. Ret value={ret_code}. Output=\n{output}"


def test_launcher_sigterm_graceful_exit(tmp_dir):
Expand All @@ -136,14 +136,14 @@ def test_launcher_sigterm_graceful_exit(tmp_dir):
ft_cfg_path = _save_ft_cfg(ft_cfg, tmp_dir)
cmd_to_run = f"{_get_util_script_path()} --scenario={_get_func_name()} --term_handler=return0"
launcher_cmd = (
"ft_launcher --monitor-interval=1"
"PYTHONFAULTHANDLER=1 ft_launcher --monitor-interval=1"
f" --fault-tol-cfg-path={ft_cfg_path} --nproc-per-node={WORLD_SIZE} {cmd_to_run}"
)
ret_code, output = _run_launcher(launcher_cmd, DEFAULT_TIMEOUT)
assert "ALL RANKS STARTED" in output
assert "SIGTERM SENT TO LAUNCHER" in output
assert "RANK GOT SIGTERM: RETURN0" in output
assert ret_code == 0
assert ret_code == 0, f"Launcher should return with 0. Ret value={ret_code}. Output=\n{output}"


def test_launcher_sigterm_ignored(tmp_dir):
Expand All @@ -157,14 +157,14 @@ def test_launcher_sigterm_ignored(tmp_dir):
ft_cfg_path = _save_ft_cfg(ft_cfg, tmp_dir)
cmd_to_run = f"{_get_util_script_path()} --scenario={_get_func_name()} --term_handler=ignore"
launcher_cmd = (
"ft_launcher --term-timeout=5 --monitor-interval=1"
"PYTHONFAULTHANDLER=1 ft_launcher --term-timeout=5 --monitor-interval=1"
f" --fault-tol-cfg-path={ft_cfg_path} --nproc-per-node={WORLD_SIZE} {cmd_to_run}"
)
ret_code, output = _run_launcher(launcher_cmd, DEFAULT_TIMEOUT)
assert "ALL RANKS STARTED" in output
assert "SIGTERM SENT TO LAUNCHER" in output
assert "RANK GOT SIGTERM: IGNORED" in output
assert ret_code == 1
assert ret_code == 1, f"Launcher should return with 1. Ret value={ret_code}. Output=\n{output}"


def test_ranks_restart(tmp_dir):
Expand All @@ -178,7 +178,7 @@ def test_ranks_restart(tmp_dir):
ft_cfg_path = _save_ft_cfg(ft_cfg, tmp_dir)
cmd_to_run = f"{_get_util_script_path()} --scenario={_get_func_name()} --tmp_dir={tmp_dir}"
launcher_cmd = (
"ft_launcher --max-restarts=2 --monitor-interval=1"
"PYTHONFAULTHANDLER=1 ft_launcher --max-restarts=2 --monitor-interval=1"
f" --fault-tol-cfg-path={ft_cfg_path} --nproc-per-node={WORLD_SIZE} {cmd_to_run}"
)
ret_code, output = _run_launcher(launcher_cmd, DEFAULT_TIMEOUT)
Expand All @@ -189,7 +189,7 @@ def test_ranks_restart(tmp_dir):
assert "RANK FAILED" in output
assert "RESTART #2" in output
assert "RANK EXITS GRACEFULLY" in output
assert ret_code == 0
assert ret_code == 0, f"Launcher should return with 0. Ret value={ret_code}. Output=\n{output}"


def test_missing_cfg(tmp_dir):
Expand All @@ -200,40 +200,44 @@ def test_missing_cfg(tmp_dir):
# By default, launcher should raise an error if FT config cant be read
cmd_to_run = f"{_get_util_script_path()} --scenario=test_ranks_exit_gracefully"
launcher_cmd = (
"ft_launcher --monitor-interval=1"
"PYTHONFAULTHANDLER=1 ft_launcher --monitor-interval=1"
f" --fault-tol-cfg-path={empty_ft_cfg_path} --nproc-per-node={WORLD_SIZE} {cmd_to_run}"
)
ret_code, output = _run_launcher(launcher_cmd, DEFAULT_TIMEOUT)
assert ret_code != 0
assert (
ret_code != 0
), f"Launcher should return with non 0. Ret value={ret_code}. Output=\n{output}"
# Empty config file again, But this time there are FT args in CLI, so should be fine
cmd_to_run = f"{_get_util_script_path()} --scenario=test_ranks_exit_gracefully"
launcher_cmd = (
"ft_launcher --monitor-interval=1"
"PYTHONFAULTHANDLER=1 ft_launcher --monitor-interval=1"
f" --fault-tol-cfg-path={empty_ft_cfg_path} --nproc-per-node={WORLD_SIZE} --ft-param-rank_heartbeat_timeout=1.0"
f" {cmd_to_run}"
)
ret_code, output = _run_launcher(launcher_cmd, DEFAULT_TIMEOUT)
assert ret_code == 0
assert ret_code == 0, f"Launcher should return with 0. Ret value={ret_code}. Output=\n{output}"
# Empty config file again, launcher run with `--ignore-missing-fault-tol-cfg` should use defaults
cmd_to_run = f"{_get_util_script_path()} --scenario=test_ranks_exit_gracefully"
launcher_cmd = (
"ft_launcher --monitor-interval=1"
"PYTHONFAULTHANDLER=1 ft_launcher --monitor-interval=1"
f" --fault-tol-cfg-path={empty_ft_cfg_path} --ignore-missing-fault-tol-cfg"
f" --nproc-per-node={WORLD_SIZE} {cmd_to_run}"
)
ret_code, output = _run_launcher(launcher_cmd, DEFAULT_TIMEOUT)
assert ret_code == 0
assert ret_code == 0, f"Launcher should return with 0. Ret value={ret_code}. Output=\n{output}"
# Invalid config file path - should fail despite --ignore-missing-fault-tol-cfg and FT args specified via CLI
cmd_to_run = f"{_get_util_script_path()} --scenario=test_ranks_exit_gracefully"
launcher_cmd = (
"ft_launcher --monitor-interval=1"
"PYTHONFAULTHANDLER=1 ft_launcher --monitor-interval=1"
" --fault-tol-cfg-path=/not/there.yaml"
" --ft-param-rank_heartbeat_timeout=1.0"
f" --nproc-per-node={WORLD_SIZE} --ignore-missing-fault-tol-cfg"
f" {cmd_to_run}"
)
ret_code, output = _run_launcher(launcher_cmd, DEFAULT_TIMEOUT)
assert ret_code != 0
assert (
ret_code != 0
), f"Launcher should return with non 0. Ret value={ret_code}. Output=\n{output}"


def test_config_provided_via_cli(tmp_dir):
Expand All @@ -246,9 +250,12 @@ def test_config_provided_via_cli(tmp_dir):
" --ft-param-log_level=WARNING"
)
cmd_to_run = f"{_get_util_script_path()} --scenario=dump_cfg --tmp_dir={tmp_dir}"
launcher_cmd = "ft_launcher" f" {ft_params_str} --nproc-per-node={WORLD_SIZE} {cmd_to_run}"
launcher_cmd = (
"PYTHONFAULTHANDLER=1 ft_launcher"
f" {ft_params_str} --nproc-per-node={WORLD_SIZE} {cmd_to_run}"
)
ret_code, output = _run_launcher(launcher_cmd, DEFAULT_TIMEOUT)
assert ret_code == 0
assert ret_code == 0, f"Launcher should return with 0. Ret value={ret_code}. Output=\n{output}"

dumped_ft_cfg_path = os.path.join(tmp_dir, "cfg_dump.yaml")
assert os.path.exists(dumped_ft_cfg_path)
Expand Down Expand Up @@ -282,11 +289,11 @@ def test_config_provided_via_cli_overwrites_yaml(tmp_dir):
)
cmd_to_run = f"{_get_util_script_path()} --scenario=dump_cfg --tmp_dir={tmp_dir}"
launcher_cmd = (
"ft_launcher"
"PYTHONFAULTHANDLER=1 ft_launcher"
f" {ft_params_str} --fault-tol-cfg-path={ft_cfg_path} --nproc-per-node={WORLD_SIZE} {cmd_to_run}"
)
ret_code, output = _run_launcher(launcher_cmd, DEFAULT_TIMEOUT)
assert ret_code == 0
assert ret_code == 0, f"Launcher should return with 0. Ret value={ret_code}. Output=\n{output}"

dumped_ft_cfg_path = os.path.join(tmp_dir, "cfg_dump.yaml")
assert os.path.exists(dumped_ft_cfg_path)
Expand Down
2 changes: 2 additions & 0 deletions tests/fault_tolerance/unit/test_timeouts_calc.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import faulthandler
import shutil
import sys
import tempfile
Expand Down Expand Up @@ -50,6 +51,7 @@ def test_basic():


def _rank_main(*args, tmp_dir, **kwargs):
faulthandler.enable(file=sys.stderr)
tc = TimeoutsCalc(start_time=0, safety_factor=2.0)
rank = dist.get_rank()
if rank in [1, 2]:
Expand Down
2 changes: 0 additions & 2 deletions tests/fault_tolerance/unit/test_update_state.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,6 @@

from nvidia_resiliency_ext.fault_tolerance import dict_utils as ft_utils

torch.set_default_device("cuda")


def test_merge_state_dicts():
d1 = {}
Expand Down
5 changes: 5 additions & 0 deletions tests/fault_tolerance/unit/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
# limitations under the License.

import contextlib
import gc
import os
import socket
import sys
Expand Down Expand Up @@ -113,6 +114,10 @@ def distributed_worker(

worker_fn(**kwargs)

# `destroy_process_group` hangs were observed in CI
# use GC collect and barrier to mitigate the issue
gc.collect()
torch.distributed.barrier()
torch.distributed.destroy_process_group()

sys.exit(0)
Expand Down
Loading
Loading