diff --git a/.github/workflows/build_docs.yml b/.github/workflows/build_docs.yml
index 9d8153bb..d1321605 100644
--- a/.github/workflows/build_docs.yml
+++ b/.github/workflows/build_docs.yml
@@ -15,7 +15,7 @@ jobs:
 
     steps:
     - name: Checkout repository
-      uses: actions/checkout@v3
+      uses: actions/checkout@v4
 
     - name: Set up Python
       uses: actions/setup-python@v4
diff --git a/.github/workflows/lint_code.yml b/.github/workflows/lint_code.yml
index 54350df2..2be61def 100644
--- a/.github/workflows/lint_code.yml
+++ b/.github/workflows/lint_code.yml
@@ -14,7 +14,7 @@ jobs:
 
     steps:
     - name: Checkout repository
-      uses: actions/checkout@v3
+      uses: actions/checkout@v4
 
     - name: Set up Python
      uses: actions/setup-python@v4
diff --git a/.github/workflows/unit_test.yml b/.github/workflows/unit_test.yml
new file mode 100644
index 00000000..eb1408c4
--- /dev/null
+++ b/.github/workflows/unit_test.yml
@@ -0,0 +1,94 @@
+name: Run Unit Tests
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
+
+jobs:
+
+  build_wheels:
+    runs-on: ubuntu-24.04
+    container:
+      image: 'nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04'
+    steps:
+      - name: Update GCC
+        run: |
+          export DEBIAN_FRONTEND=noninteractive
+          apt update && apt install -y build-essential gcc-10 g++-10
+      - name: Install Python versions and pips
+        run: |
+          export DEBIAN_FRONTEND=noninteractive
+          apt update && apt install -y software-properties-common curl
+          add-apt-repository ppa:deadsnakes/ppa
+          apt-get install -y python3.10 python3.10-dev python3.10-distutils
+          apt-get install -y python3.11 python3.11-dev python3.11-distutils
+          apt-get install -y python3.12 python3.12-dev python3.12-distutils
+          curl -sS https://bootstrap.pypa.io/get-pip.py | python3.10
+          curl -sS https://bootstrap.pypa.io/get-pip.py | python3.11
+          curl -sS https://bootstrap.pypa.io/get-pip.py | python3.12
+      - name: Checkout code
+        uses: actions/checkout@v4
+      - name: Build wheel with Python 3.10
+        run: |
+          python3.10 -m pip install -U poetry build six
+          python3.10 -m poetry build -f wheel
+      - name: Build wheel with Python 3.11
+        run: |
+          python3.11 -m pip install -U poetry build six
+          python3.11 -m poetry build -f wheel
+      - name: Build wheel with Python 3.12
+        run: |
+          python3.12 -m pip install -U poetry build six
+          python3.12 -m poetry build -f wheel
+      - name: Upload the wheel artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: resiliency-wheels
+          path: dist/*.whl
+
+  unit_tests:
+    runs-on: ubuntu-24.04
+    needs: build_wheels
+    strategy:
+      matrix:
+        container:
+          - 'pytorch/pytorch:2.4.1-cuda12.1-cudnn9-runtime'
+          - 'pytorch/pytorch:2.2.2-cuda12.1-cudnn8-runtime'
+          - 'pytorch/pytorch:2.1.0-cuda11.8-cudnn8-runtime'
+        test_type: ['fault_tolerance', 'straggler', 'ptl_resiliency']
+    container:
+      image: ${{ matrix.container }}
+      env:
+        MKL_SERVICE_FORCE_INTEL: 1 # Fix for "MKL_THREADING_LAYER=INTEL is incompatible with libgomp.so.1 library."
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+      - name: Download wheels
+        uses: actions/download-artifact@v4
+        with:
+          name: resiliency-wheels
+          path: ./dist/
+      - name: Set up environment
+        run: |
+          pip install pytest lightning
+          PY_VER_NODOT=$(python -c"import sysconfig; print(sysconfig.get_config_var('py_version_nodot'))")
+          pip install ./dist/nvidia_resiliency_ext-*-cp${PY_VER_NODOT}-*.whl
+      - name: Run unit tests
+        shell: bash
+        run: |
+          if [[ "${{ matrix.test_type }}" == "straggler" ]]; then
+            pytest -s -vvv -m "not gpu" ./tests/straggler/unit/
+            exit 0
+          elif [[ "${{ matrix.test_type }}" == "ptl_resiliency" ]]; then
+            pytest -s -vvv -m "not gpu" ./tests/ptl_resiliency/unit/
+            exit 0
+          elif [[ "${{ matrix.test_type }}" == "fault_tolerance" ]]; then
+            pytest -s -vvv -m "not gpu" ./tests/fault_tolerance/unit/
+            exit 0
+          else
+            echo "Unknown test type: ${{ matrix.test_type }}"
+            exit 1
+          fi
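
For reference, the "Set up environment" step above derives the CPython tag from sysconfig so that each matrix container installs the wheel built for its own interpreter. A minimal sketch of what that expression evaluates to (the printed value is only an example for Python 3.10):

import sysconfig

# 'py_version_nodot' is e.g. '310' on Python 3.10, so the glob below matches
# the cp310 wheel uploaded by the build_wheels job.
py_ver_nodot = sysconfig.get_config_var("py_version_nodot")
print(f"nvidia_resiliency_ext-*-cp{py_ver_nodot}-*.whl")
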
diff --git a/Dockerfile.builder b/Dockerfile.builder
deleted file mode 100644
index b128b263..00000000
--- a/Dockerfile.builder
+++ /dev/null
@@ -1,43 +0,0 @@
-# This image purpose is to build "nvidia_resiliency_ext" wheels using different Python versions.
-# There are python3.10, python3.11 and python3.12 installed.
-# Base image is CUDA, as Straggler Detection package uses CUPTI.
-# Wheel for Python3.10 can be created with "python3.10 -m build --wheel" etc.
-
-# Choose a base CUDA image from NVIDIA
-# nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04, nvidia/cuda:12.4.1-cudnn-devel-ubuntu20.04 etc.
-ARG BASE_CUDA_IMG=nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04
-FROM ${BASE_CUDA_IMG}
-
-# Set environment variables to non-interactive to avoid prompts during package installation
-ENV DEBIAN_FRONTEND=noninteractive
-
-# Repo with Pythons
-RUN apt update && apt install -y software-properties-common && add-apt-repository ppa:deadsnakes/ppa
-
-# Install common dependencies
-RUN apt-get update && apt-get install -y \
-    python3.10 python3.10-dev python3.10-distutils \
-    python3.11 python3.11-dev python3.11-distutils \
-    python3.12 python3.12-dev python3.12-distutils \
-    wget curl build-essential gcc-10 g++-10 \
-    && rm -rf /var/lib/apt/lists/*
-
-# Install pip for each Python version
-RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.10 && \
-    curl -sS https://bootstrap.pypa.io/get-pip.py | python3.11 && \
-    curl -sS https://bootstrap.pypa.io/get-pip.py | python3.12
-
-# Install deps,
-# FIXME: for some reason six needs to be manually updated
-# otherwise wheel building fails with: ModuleNotFoundError: No module named 'six'
-RUN python3.10 -m pip install build poetry && \
-    python3.11 -m pip install build poetry && \
-    python3.12 -m pip install build poetry && \
-    python3.10 -m pip install -U six && \
-    python3.11 -m pip install -U six && \
-    python3.12 -m pip install -U six
-
-# Set the working directory
-WORKDIR /workspace
-
-ENTRYPOINT ["/bin/bash", "-c"]
diff --git a/pytest.ini b/pytest.ini
new file mode 100644
index 00000000..eb606adf
--- /dev/null
+++ b/pytest.ini
@@ -0,0 +1,3 @@
+[pytest]
+markers =
+    gpu: tests that require GPU
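
The gpu marker registered in pytest.ini is what lets the CPU-only CI containers above run `pytest -m "not gpu"` without warnings; several test modules below opt in via a module-level pytestmark. A minimal sketch of both ways to apply the marker (the test name and module are hypothetical):

import pytest
import torch

# Applies the marker to every test in this (hypothetical) module.
pytestmark = pytest.mark.gpu


@pytest.mark.gpu
def test_requires_cuda_device():
    # Deselected in CI via `pytest -m "not gpu"`; runs only where a GPU is present.
    assert torch.cuda.is_available()

Registering the marker under `markers =` in pytest.ini keeps pytest from warning about an unknown mark (and from failing under --strict-markers).
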
diff --git a/tests/fault_tolerance/unit/test_launcher.py b/tests/fault_tolerance/unit/test_launcher.py
index 6ac842a4..050a7071 100644
--- a/tests/fault_tolerance/unit/test_launcher.py
+++ b/tests/fault_tolerance/unit/test_launcher.py
@@ -47,7 +47,7 @@ def _run_launcher(cmd_to_run, timeout):
         cmd_to_run,
         shell=True,
         stdout=subprocess.PIPE,
-        stderr=subprocess.DEVNULL,
+        stderr=subprocess.PIPE,
         text=True,
     )
     stdout, _ = proc.communicate(timeout=timeout)
@@ -79,13 +79,13 @@ def test_rank_not_send_initial_hb(tmp_dir):
     ft_cfg_path = _save_ft_cfg(ft_cfg, tmp_dir)
     cmd_to_run = f"{_get_util_script_path()} --scenario={_get_func_name()} --which_rank=1"
     launcher_cmd = (
-        "ft_launcher --monitor-interval=1"
+        "PYTHONFAULTHANDLER=1 ft_launcher --monitor-interval=1"
         f" --fault-tol-cfg-path={ft_cfg_path} --nproc-per-node={WORLD_SIZE} {cmd_to_run}"
     )
     ret_code, output = _run_launcher(launcher_cmd, DEFAULT_TIMEOUT)
     assert "ALL RANKS STARTED" in output
     assert "RANK IS SKIPPING INITIAL HB" in output
-    assert ret_code == 1
+    assert ret_code == 1, f"Launcher should return with 1. Ret value={ret_code}. Output=\n{output}"


 def test_rank_failed(tmp_dir):
@@ -98,13 +98,13 @@
     ft_cfg_path = _save_ft_cfg(ft_cfg, tmp_dir)
     cmd_to_run = f"{_get_util_script_path()} --scenario={_get_func_name()} --which_rank=1"
     launcher_cmd = (
-        "ft_launcher --monitor-interval=1"
+        "PYTHONFAULTHANDLER=1 ft_launcher --monitor-interval=1"
         f" --fault-tol-cfg-path={ft_cfg_path} --nproc-per-node={WORLD_SIZE} {cmd_to_run}"
     )
     ret_code, output = _run_launcher(launcher_cmd, DEFAULT_TIMEOUT)
     assert "ALL RANKS STARTED" in output
     assert "RANK FAILED" in output
-    assert ret_code == 1
+    assert ret_code == 1, f"Launcher should return with 1. Ret value={ret_code}. Output=\n{output}"


 def test_ranks_exit_gracefully(tmp_dir):
@@ -116,13 +116,13 @@
     ft_cfg_path = _save_ft_cfg(ft_cfg, tmp_dir)
     cmd_to_run = f"{_get_util_script_path()} --scenario={_get_func_name()}"
     launcher_cmd = (
-        "ft_launcher --monitor-interval=1"
+        "PYTHONFAULTHANDLER=1 ft_launcher --monitor-interval=1"
         f" --fault-tol-cfg-path={ft_cfg_path} --nproc-per-node={WORLD_SIZE} {cmd_to_run}"
     )
     ret_code, output = _run_launcher(launcher_cmd, DEFAULT_TIMEOUT)
     assert "ALL RANKS STARTED" in output
     assert "RANK EXITS GRACEFULLY" in output
-    assert ret_code == 0
+    assert ret_code == 0, f"Launcher should return with 0. Ret value={ret_code}. Output=\n{output}"


 def test_launcher_sigterm_graceful_exit(tmp_dir):
@@ -136,14 +136,14 @@
     ft_cfg_path = _save_ft_cfg(ft_cfg, tmp_dir)
     cmd_to_run = f"{_get_util_script_path()} --scenario={_get_func_name()} --term_handler=return0"
     launcher_cmd = (
-        "ft_launcher --monitor-interval=1"
+        "PYTHONFAULTHANDLER=1 ft_launcher --monitor-interval=1"
         f" --fault-tol-cfg-path={ft_cfg_path} --nproc-per-node={WORLD_SIZE} {cmd_to_run}"
     )
     ret_code, output = _run_launcher(launcher_cmd, DEFAULT_TIMEOUT)
     assert "ALL RANKS STARTED" in output
     assert "SIGTERM SENT TO LAUNCHER" in output
     assert "RANK GOT SIGTERM: RETURN0" in output
-    assert ret_code == 0
+    assert ret_code == 0, f"Launcher should return with 0. Ret value={ret_code}. Output=\n{output}"


 def test_launcher_sigterm_ignored(tmp_dir):
@@ -157,14 +157,14 @@
     ft_cfg_path = _save_ft_cfg(ft_cfg, tmp_dir)
     cmd_to_run = f"{_get_util_script_path()} --scenario={_get_func_name()} --term_handler=ignore"
     launcher_cmd = (
-        "ft_launcher --term-timeout=5 --monitor-interval=1"
+        "PYTHONFAULTHANDLER=1 ft_launcher --term-timeout=5 --monitor-interval=1"
         f" --fault-tol-cfg-path={ft_cfg_path} --nproc-per-node={WORLD_SIZE} {cmd_to_run}"
     )
     ret_code, output = _run_launcher(launcher_cmd, DEFAULT_TIMEOUT)
     assert "ALL RANKS STARTED" in output
     assert "SIGTERM SENT TO LAUNCHER" in output
     assert "RANK GOT SIGTERM: IGNORED" in output
-    assert ret_code == 1
+    assert ret_code == 1, f"Launcher should return with 1. Ret value={ret_code}. Output=\n{output}"


 def test_ranks_restart(tmp_dir):
@@ -178,7 +178,7 @@
     ft_cfg_path = _save_ft_cfg(ft_cfg, tmp_dir)
     cmd_to_run = f"{_get_util_script_path()} --scenario={_get_func_name()} --tmp_dir={tmp_dir}"
     launcher_cmd = (
-        "ft_launcher --max-restarts=2 --monitor-interval=1"
+        "PYTHONFAULTHANDLER=1 ft_launcher --max-restarts=2 --monitor-interval=1"
         f" --fault-tol-cfg-path={ft_cfg_path} --nproc-per-node={WORLD_SIZE} {cmd_to_run}"
     )
     ret_code, output = _run_launcher(launcher_cmd, DEFAULT_TIMEOUT)
@@ -189,7 +189,7 @@
     assert "RANK FAILED" in output
     assert "RESTART #2" in output
     assert "RANK EXITS GRACEFULLY" in output
-    assert ret_code == 0
+    assert ret_code == 0, f"Launcher should return with 0. Ret value={ret_code}. Output=\n{output}"


 def test_missing_cfg(tmp_dir):
@@ -200,40 +200,44 @@
     # By default, launcher should raise an error if FT config cant be read
     cmd_to_run = f"{_get_util_script_path()} --scenario=test_ranks_exit_gracefully"
     launcher_cmd = (
-        "ft_launcher --monitor-interval=1"
+        "PYTHONFAULTHANDLER=1 ft_launcher --monitor-interval=1"
         f" --fault-tol-cfg-path={empty_ft_cfg_path} --nproc-per-node={WORLD_SIZE} {cmd_to_run}"
     )
     ret_code, output = _run_launcher(launcher_cmd, DEFAULT_TIMEOUT)
-    assert ret_code != 0
+    assert (
+        ret_code != 0
+    ), f"Launcher should return with non 0. Ret value={ret_code}. Output=\n{output}"

     # Empty config file again, But this time there are FT args in CLI, so should be fine
     cmd_to_run = f"{_get_util_script_path()} --scenario=test_ranks_exit_gracefully"
     launcher_cmd = (
-        "ft_launcher --monitor-interval=1"
+        "PYTHONFAULTHANDLER=1 ft_launcher --monitor-interval=1"
         f" --fault-tol-cfg-path={empty_ft_cfg_path} --nproc-per-node={WORLD_SIZE} --ft-param-rank_heartbeat_timeout=1.0"
         f" {cmd_to_run}"
     )
     ret_code, output = _run_launcher(launcher_cmd, DEFAULT_TIMEOUT)
-    assert ret_code == 0
+    assert ret_code == 0, f"Launcher should return with 0. Ret value={ret_code}. Output=\n{output}"

     # Empty config file again, launcher run with `--ignore-missing-fault-tol-cfg` should use defaults
     cmd_to_run = f"{_get_util_script_path()} --scenario=test_ranks_exit_gracefully"
     launcher_cmd = (
-        "ft_launcher --monitor-interval=1"
+        "PYTHONFAULTHANDLER=1 ft_launcher --monitor-interval=1"
         f" --fault-tol-cfg-path={empty_ft_cfg_path} --ignore-missing-fault-tol-cfg"
         f" --nproc-per-node={WORLD_SIZE} {cmd_to_run}"
     )
     ret_code, output = _run_launcher(launcher_cmd, DEFAULT_TIMEOUT)
-    assert ret_code == 0
+    assert ret_code == 0, f"Launcher should return with 0. Ret value={ret_code}. Output=\n{output}"

     # Invalid config file path - should fail despite --ignore-missing-fault-tol-cfg and FT args specified via CLI
     cmd_to_run = f"{_get_util_script_path()} --scenario=test_ranks_exit_gracefully"
     launcher_cmd = (
-        "ft_launcher --monitor-interval=1"
+        "PYTHONFAULTHANDLER=1 ft_launcher --monitor-interval=1"
         " --fault-tol-cfg-path=/not/there.yaml"
         " --ft-param-rank_heartbeat_timeout=1.0"
         f" --nproc-per-node={WORLD_SIZE} --ignore-missing-fault-tol-cfg"
         f" {cmd_to_run}"
     )
     ret_code, output = _run_launcher(launcher_cmd, DEFAULT_TIMEOUT)
-    assert ret_code != 0
+    assert (
+        ret_code != 0
+    ), f"Launcher should return with non 0. Ret value={ret_code}. Output=\n{output}"


 def test_config_provided_via_cli(tmp_dir):
@@ -246,9 +250,12 @@
         " --ft-param-log_level=WARNING"
     )
     cmd_to_run = f"{_get_util_script_path()} --scenario=dump_cfg --tmp_dir={tmp_dir}"
-    launcher_cmd = "ft_launcher" f" {ft_params_str} --nproc-per-node={WORLD_SIZE} {cmd_to_run}"
+    launcher_cmd = (
+        "PYTHONFAULTHANDLER=1 ft_launcher"
+        f" {ft_params_str} --nproc-per-node={WORLD_SIZE} {cmd_to_run}"
+    )
     ret_code, output = _run_launcher(launcher_cmd, DEFAULT_TIMEOUT)
-    assert ret_code == 0
+    assert ret_code == 0, f"Launcher should return with 0. Ret value={ret_code}. Output=\n{output}"

     dumped_ft_cfg_path = os.path.join(tmp_dir, "cfg_dump.yaml")
     assert os.path.exists(dumped_ft_cfg_path)
@@ -282,11 +289,11 @@ def test_config_provided_via_cli_overwrites_yaml(tmp_dir):
     )
     cmd_to_run = f"{_get_util_script_path()} --scenario=dump_cfg --tmp_dir={tmp_dir}"
     launcher_cmd = (
-        "ft_launcher"
+        "PYTHONFAULTHANDLER=1 ft_launcher"
         f" {ft_params_str} --fault-tol-cfg-path={ft_cfg_path} --nproc-per-node={WORLD_SIZE} {cmd_to_run}"
     )
     ret_code, output = _run_launcher(launcher_cmd, DEFAULT_TIMEOUT)
-    assert ret_code == 0
+    assert ret_code == 0, f"Launcher should return with 0. Ret value={ret_code}. Output=\n{output}"

     dumped_ft_cfg_path = os.path.join(tmp_dir, "cfg_dump.yaml")
     assert os.path.exists(dumped_ft_cfg_path)
diff --git a/tests/fault_tolerance/unit/test_timeouts_calc.py b/tests/fault_tolerance/unit/test_timeouts_calc.py
index 08f021b3..bb5e9868 100644
--- a/tests/fault_tolerance/unit/test_timeouts_calc.py
+++ b/tests/fault_tolerance/unit/test_timeouts_calc.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import faulthandler
 import shutil
 import sys
 import tempfile
@@ -50,6 +51,7 @@ def test_basic():


 def _rank_main(*args, tmp_dir, **kwargs):
+    faulthandler.enable(file=sys.stderr)
     tc = TimeoutsCalc(start_time=0, safety_factor=2.0)
     rank = dist.get_rank()
     if rank in [1, 2]:
diff --git a/tests/fault_tolerance/unit/test_update_state.py b/tests/fault_tolerance/unit/test_update_state.py
index 85a48194..61071f63 100644
--- a/tests/fault_tolerance/unit/test_update_state.py
+++ b/tests/fault_tolerance/unit/test_update_state.py
@@ -19,8 +19,6 @@

 from nvidia_resiliency_ext.fault_tolerance import dict_utils as ft_utils

-torch.set_default_device("cuda")
-

 def test_merge_state_dicts():
     d1 = {}
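
The PYTHONFAULTHANDLER=1 prefix added to the launcher commands above has the same effect as calling faulthandler.enable() at interpreter startup, so a crashing or hanging rank prints Python tracebacks to stderr; the companion change from stderr=subprocess.DEVNULL to stderr=subprocess.PIPE makes that output available to the test process. A small sketch of the equivalent in-process setup; the watchdog timeout is illustrative and not part of this patch:

import faulthandler
import sys

# Same effect as running the process with PYTHONFAULTHANDLER=1.
faulthandler.enable(file=sys.stderr)

# Illustrative extra: also dump all thread stacks if the process is still
# alive after 60 seconds, which helps diagnose CI hangs.
faulthandler.dump_traceback_later(60, repeat=False, file=sys.stderr, exit=False)
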
diff --git a/tests/fault_tolerance/unit/utils.py b/tests/fault_tolerance/unit/utils.py
index bba16133..542bb948 100644
--- a/tests/fault_tolerance/unit/utils.py
+++ b/tests/fault_tolerance/unit/utils.py
@@ -14,6 +14,7 @@
 # limitations under the License.

 import contextlib
+import gc
 import os
 import socket
 import sys
@@ -113,6 +114,10 @@ def distributed_worker(

     worker_fn(**kwargs)

+    # `destroy_process_group` hangs were observed in CI
+    # use GC collect and barrier to mitigate the issue
+    gc.collect()
+    torch.distributed.barrier()
     torch.distributed.destroy_process_group()
     sys.exit(0)
diff --git a/tests/ptl_resiliency/unit/test_ft_callback.py b/tests/ptl_resiliency/unit/test_ft_callback.py
index 98ae63f3..bce782b7 100644
--- a/tests/ptl_resiliency/unit/test_ft_callback.py
+++ b/tests/ptl_resiliency/unit/test_ft_callback.py
@@ -206,7 +206,7 @@ def _run_trainining(
     trainer = pl.Trainer(
         strategy='ddp',
         devices=1,
-        accelerator='gpu',
+        accelerator='cpu',
         logger=False,
         max_steps=max_steps,
         max_epochs=max_epochs,
@@ -243,7 +243,7 @@ def _run_eval(tmp_path, which='not set'):
     trainer = pl.Trainer(
         strategy='ddp',
         devices=1,
-        accelerator='gpu',
+        accelerator='cpu',
         logger=False,
         callbacks=[fault_tol_cb, checkpoint_callback],
     )
diff --git a/tests/ptl_resiliency/unit/test_straggler_det_callback.py b/tests/ptl_resiliency/unit/test_straggler_det_callback.py
index dfb48862..6cbdba79 100644
--- a/tests/ptl_resiliency/unit/test_straggler_det_callback.py
+++ b/tests/ptl_resiliency/unit/test_straggler_det_callback.py
@@ -26,6 +26,8 @@

 from nvidia_resiliency_ext.ptl_resiliency import StragglerDetectionCallback

+pytestmark = pytest.mark.gpu
+

 class OnesDataset(torch.utils.data.Dataset):
     def __init__(self, dataset_len):
diff --git a/tests/straggler/unit/_utils.py b/tests/straggler/unit/_utils.py
index bba16133..542bb948 100644
--- a/tests/straggler/unit/_utils.py
+++ b/tests/straggler/unit/_utils.py
@@ -14,6 +14,7 @@
 # limitations under the License.

 import contextlib
+import gc
 import os
 import socket
 import sys
@@ -113,6 +114,10 @@ def distributed_worker(

     worker_fn(**kwargs)

+    # `destroy_process_group` hangs were observed in CI
+    # use GC collect and barrier to mitigate the issue
+    gc.collect()
+    torch.distributed.barrier()
     torch.distributed.destroy_process_group()
     sys.exit(0)
diff --git a/tests/straggler/unit/test_cupti_ext.py b/tests/straggler/unit/test_cupti_ext.py
index bc954151..5f4a4dca 100644
--- a/tests/straggler/unit/test_cupti_ext.py
+++ b/tests/straggler/unit/test_cupti_ext.py
@@ -19,6 +19,8 @@

 from nvidia_resiliency_ext.straggler import cupti_module

+pytestmark = pytest.mark.gpu
+

 def test_basic_kernel_tracking():
     cupti_ext = cupti_module.CuptiProfiler()
diff --git a/tests/straggler/unit/test_cupti_manager.py b/tests/straggler/unit/test_cupti_manager.py
index e94c2427..5d88f81c 100644
--- a/tests/straggler/unit/test_cupti_manager.py
+++ b/tests/straggler/unit/test_cupti_manager.py
@@ -18,6 +18,8 @@

 from nvidia_resiliency_ext.straggler.cupti import CuptiManager

+pytestmark = pytest.mark.gpu
+

 def test_cupti_manager_start_stop():
     cupti_mgr = CuptiManager()
diff --git a/tests/straggler/unit/test_det_section_api.py b/tests/straggler/unit/test_det_section_api.py
index cf876f89..123763c5 100644
--- a/tests/straggler/unit/test_det_section_api.py
+++ b/tests/straggler/unit/test_det_section_api.py
@@ -20,6 +20,8 @@

 from nvidia_resiliency_ext import straggler

+pytestmark = pytest.mark.gpu
+

 @pytest.fixture
 def _straggler_init_shutdown():
diff --git a/tests/straggler/unit/test_interval_tracker.py b/tests/straggler/unit/test_interval_tracker.py
index 5e18403c..14cdbda1 100644
--- a/tests/straggler/unit/test_interval_tracker.py
+++ b/tests/straggler/unit/test_interval_tracker.py
@@ -15,8 +15,13 @@

 import time

+import pytest
+
 from nvidia_resiliency_ext.straggler import interval_tracker

+# FIXME: should work without GPU as well
+pytestmark = pytest.mark.gpu
+

 def test_estimate():
diff --git a/tests/straggler/unit/test_reporting.py b/tests/straggler/unit/test_reporting.py
index 00e3c3a6..3efbd0f1 100644
--- a/tests/straggler/unit/test_reporting.py
+++ b/tests/straggler/unit/test_reporting.py
@@ -25,6 +25,8 @@

 from nvidia_resiliency_ext import straggler

+pytestmark = pytest.mark.gpu
+

 class Layer(nn.Module):
     def __init__(self, in_features, out_features, bias):
diff --git a/tests/straggler/unit/test_reporting_elapsed.py b/tests/straggler/unit/test_reporting_elapsed.py
index 6827910d..c3a92c94 100644
--- a/tests/straggler/unit/test_reporting_elapsed.py
+++ b/tests/straggler/unit/test_reporting_elapsed.py
@@ -24,6 +24,8 @@

 from nvidia_resiliency_ext import straggler

+pytestmark = pytest.mark.gpu
+

 class Layer(nn.Module):
     def __init__(self, in_features, out_features, bias):
diff --git a/tests/straggler/unit/test_sections.py b/tests/straggler/unit/test_sections.py
index 63c2e3ee..842a2fa5 100644
--- a/tests/straggler/unit/test_sections.py
+++ b/tests/straggler/unit/test_sections.py
@@ -24,6 +24,9 @@

 from ._utils import multiprocessing_execute_join, multiprocessing_execute_start

+# FIXME: should work without GPU as well
+pytestmark = pytest.mark.gpu
+
 # This is a basic test of custom sections:
 # - Run distributed dummy workload, where each rank has 3 custom sections
 # - Each rank executes the sections in a loop
diff --git a/tests/straggler/unit/test_wrap_callables.py b/tests/straggler/unit/test_wrap_callables.py
index 3025d3b2..ff75297f 100644
--- a/tests/straggler/unit/test_wrap_callables.py
+++ b/tests/straggler/unit/test_wrap_callables.py
@@ -24,6 +24,8 @@

 from ._utils import multiprocessing_execute_join, multiprocessing_execute_start

+pytestmark = pytest.mark.gpu
+
 TEST_WORLD_SIZE = 4
 ALL_RANK_IDS = set(range(TEST_WORLD_SIZE))
 RANK_DONE_TIMEOUT = 30
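
For completeness, the teardown change applied to both distributed_worker helpers above follows the pattern sketched below; the single-rank gloo setup exists only to make the sketch self-contained and is not part of the patch:

import gc
import os

import torch.distributed as dist


def shutdown_worker():
    # `destroy_process_group` hangs were observed in CI; forcing a GC pass and
    # synchronizing all ranks first mitigates the issue.
    gc.collect()
    dist.barrier()
    dist.destroy_process_group()


if __name__ == "__main__":
    # Illustrative single-rank "world" so the sketch runs without a launcher.
    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29500")
    dist.init_process_group(backend="gloo", rank=0, world_size=1)
    shutdown_worker()
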