Skip to content

Commit 0ee3d84

Browse files
Update to v0.2.0
0 parents  commit 0ee3d84

File tree

250 files changed

+38106
-0
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

250 files changed

+38106
-0
lines changed

.github/workflows/build_docs.yml

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
# Builds the Sphinx documentation and publishes the generated HTML to the
# gh-pages branch.
name: Build and Deploy Docs

on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main
  workflow_dispatch:

jobs:
  build_docs:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          # Quoted so YAML does not read the version as the float 3.1.
          python-version: '3.10'

      - name: Install dependencies
        run: |
          pip install -U sphinx sphinx-rtd-theme sphinxcontrib-napoleon psutil

      - name: Build Documentation
        run: |
          sphinx-build -b html docs/source public/
          if [ ! -d "public" ]; then
            echo "Error: Documentation build failed. 'public/' directory not found."
            exit 1
          fi

      - name: Deploy to GitHub Pages
        # Pull requests only verify that the docs build; publishing from an
        # unmerged branch would overwrite the live site with unreviewed content.
        if: github.event_name != 'pull_request'
        run: |
          git config --global user.name "GitHub Actions"
          # NOTE(review): the address was masked as "[email protected]" in the
          # reviewed copy; the github-actions bot noreply address is used here.
          # Confirm against the repository history.
          git config --global user.email "41898282+github-actions[bot]@users.noreply.github.com"

          if [ ! -d "public" ]; then
            echo "Error: 'public/' directory does not exist during deployment."
            exit 1
          fi

          # 'public/' is untracked, so 'git clean -fdx' below would delete it.
          # Move the build output out of the workspace before cleaning.
          SITE_DIR="$(mktemp -d)"
          mv public "$SITE_DIR/public"
          echo "public directory exists. Proceeding with deployment."

          git reset --hard
          git clean -fdx

          # Reuse the published branch when it exists; otherwise start a new
          # history-less branch.
          if git ls-remote --exit-code origin gh-pages; then
            git fetch origin gh-pages
            git checkout gh-pages
          else
            git checkout --orphan gh-pages
          fi

          # Empty the working tree (keeping only .git) so files removed from
          # the docs also disappear from the published site.
          find . -maxdepth 1 ! -name '.git' ! -name '.' -exec rm -rf {} +

          cp -r "$SITE_DIR"/public/* .

          # Tell GitHub Pages not to run Jekyll, which would drop Sphinx's
          # underscore-prefixed directories such as _static.
          touch .nojekyll

          git add .
          if git diff --cached --quiet; then
            echo "No changes to commit. Skipping deployment."
            exit 0
          else
            git commit -m "Deploy updated documentation to GitHub Pages from commit $GITHUB_SHA"
            git push origin gh-pages --force
          fi

.github/workflows/lint_code.yml

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
# Checks formatting (black), import order (isort) and lint rules (ruff) on
# pushes to main and on pull requests targeting main.
name: Lint Code

on:
  push:
    branches: [main]
  pull_request:
    branches: [main]

jobs:
  lint:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          # Quoted so YAML does not read the version as the float 3.1.
          python-version: '3.10'

      # Tool versions are pinned so CI results do not drift between runs.
      - name: Install linting dependencies
        run: pip install black==24.10.0 isort==5.13.2 ruff==0.6.9

      # Each tool runs in check-only mode: the job fails instead of rewriting files.
      - name: Run Black
        run: black --check .

      - name: Run isort
        run: isort --check-only .

      - name: Run Ruff
        run: ruff check .

.github/workflows/unit_test.yml

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
# Builds wheels for Python 3.10, 3.11 and 3.12 inside a CUDA devel container,
# then installs the matching wheel in several PyTorch containers and runs the
# non-GPU unit tests for each component.
name: Run Unit Tests

on:
  push:
    branches: [main]
  pull_request:
    branches: [main]

jobs:

  build_wheels:
    runs-on: ubuntu-24.04
    container:
      image: 'nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04'
    steps:
      - name: Update GCC
        run: |
          export DEBIAN_FRONTEND=noninteractive
          apt update && apt install -y build-essential gcc-10 g++-10

      # The interpreters come from the deadsnakes PPA; pip is bootstrapped
      # separately for each interpreter with get-pip.py.
      - name: Install Python versions and pips
        run: |
          export DEBIAN_FRONTEND=noninteractive
          apt update && apt install -y software-properties-common curl
          add-apt-repository ppa:deadsnakes/ppa
          apt-get install -y python3.10 python3.10-dev python3.10-distutils
          apt-get install -y python3.11 python3.11-dev python3.11-distutils
          apt-get install -y python3.12 python3.12-dev python3.12-distutils
          curl -sS https://bootstrap.pypa.io/get-pip.py | python3.10
          curl -sS https://bootstrap.pypa.io/get-pip.py | python3.11
          curl -sS https://bootstrap.pypa.io/get-pip.py | python3.12

      - name: Checkout code
        uses: actions/checkout@v4

      # One wheel is built per interpreter.
      - name: Build wheel with Python 3.10
        run: |
          python3.10 -m pip install -U poetry build six
          python3.10 -m poetry build -f wheel

      - name: Build wheel with Python 3.11
        run: |
          python3.11 -m pip install -U poetry build six
          python3.11 -m poetry build -f wheel

      - name: Build wheel with Python 3.12
        run: |
          python3.12 -m pip install -U poetry build six
          python3.12 -m poetry build -f wheel

      - name: Upload the wheel artifact
        uses: actions/upload-artifact@v4
        with:
          name: resiliency-wheels
          path: dist/*.whl

  unit_tests:
    runs-on: ubuntu-24.04
    needs: build_wheels
    strategy:
      matrix:
        container:
          - 'pytorch/pytorch:2.4.1-cuda12.1-cudnn9-runtime'
          - 'pytorch/pytorch:2.2.2-cuda12.1-cudnn8-runtime'
          - 'pytorch/pytorch:2.1.0-cuda11.8-cudnn8-runtime'
        test_type:
          - 'fault_tolerance'
          - 'straggler'
          - 'ptl_resiliency'
    container:
      image: ${{ matrix.container }}
      env:
        # Fix for "MKL_THREADING_LAYER=INTEL is incompatible with libgomp.so.1 library."
        MKL_SERVICE_FORCE_INTEL: 1
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Download wheels
        uses: actions/download-artifact@v4
        with:
          name: resiliency-wheels
          path: ./dist/

      # Selects the wheel whose cpXY tag matches the container's interpreter.
      - name: Set up environment
        run: |
          pip install pytest lightning
          PY_VER_NODOT=$(python -c"import sysconfig; print(sysconfig.get_config_var('py_version_nodot'))")
          pip install ./dist/nvidia_resiliency_ext-*-cp${PY_VER_NODOT}-*.whl

      # Tests marked "gpu" are deselected in every branch.
      - name: Run unit tests
        shell: bash
        run: |
          case "${{ matrix.test_type }}" in
            straggler)
              pytest -s -vvv -m "not gpu" ./tests/straggler/unit/
              exit 0
              ;;
            ptl_resiliency)
              pytest -s -vvv -m "not gpu" ./tests/ptl_resiliency/unit/
              exit 0
              ;;
            fault_tolerance)
              pytest -s -vvv -m "not gpu" ./tests/fault_tolerance/unit/
              exit 0
              ;;
            *)
              echo "Unknown test type: ${{ matrix.test_type }}"
              exit 1
              ;;
          esac

.gitignore

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
build
2+
dist
3+
*.egg-info
4+
__pycache__
5+
cupti_module.*.so
6+
.pytest_cache

.pre-commit-config.yaml

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
# pre-commit hooks: import sorting, formatting and linting.
default_language_version:
  python: python3

repos:

  # Import sorting; the docs/ tree is excluded.
  - repo: https://github.com/PyCQA/isort
    rev: '5.13.2'
    hooks:
      - id: isort
        exclude: docs/

  # Code formatting.
  - repo: https://github.com/psf/black-pre-commit-mirror
    rev: '24.10.0'
    hooks:
      - id: black
        language_version: python3.10

  # Linting.
  - repo: https://github.com/astral-sh/ruff-pre-commit
    rev: 'v0.6.9'
    hooks:
      - id: ruff

CONTRIBUTING.md

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
2+
## NVIDIA Resiliency Extension (NVRx) OSS Contribution Rules
3+
4+
#### Issue Tracking
5+
6+
* All enhancement, bugfix, or change requests must begin with the creation of a [NVRx Issue Request](TBD).
7+
* The issue request must be reviewed by NVRx engineers and approved prior to code review.
8+
9+
10+
#### Coding Guidelines
11+
12+
- All source code contributions must follow the existing conventions in the relevant file, submodule, module, and project when you add new code or when you extend/fix existing functionality.
13+
14+
- Avoid introducing unnecessary complexity into existing code so that maintainability and readability are preserved.
15+
16+
- Try to keep pull requests (PRs) as concise as possible:
17+
- Avoid committing commented-out code.
18+
- Wherever possible, each PR should address a single concern. If there are several otherwise-unrelated things that should be fixed to reach a desired endpoint, our recommendation is to open several PRs and indicate the dependencies in the description. The more complex the changes are in a single PR, the more time it will take to review those changes.
19+
20+
- To ensure code consistency and maintainability across the project, please format and lint your code using the following tools before committing any changes:
21+
- We use black to automatically format Python code. It enforces a consistent style by reformatting code according to a set of rules.
22+
- To format your code, run:
23+
```
24+
black .
25+
```
26+
- isort is used to sort and format import statements automatically. Ensure that your imports are ordered correctly by running:
27+
```
28+
isort .
29+
```
30+
- ruff is a fast Python linter that helps catch common issues. Please run ruff to check for and fix linting problems:
31+
```
32+
ruff check .
33+
```
34+
35+
- Write commit titles using imperative mood and [these rules](https://chris.beams.io/posts/git-commit/), and reference the Issue number corresponding to the PR. Following is the recommended format for commit texts:
36+
```
37+
#<Issue Number> - <Commit Title>
38+
39+
<Commit Body>
40+
```
41+
42+
- Ensure that the build log is clean, meaning no warnings or errors should be present.
43+
44+
- Ensure that all unit tests pass prior to submitting your code.
45+
46+
- All OSS components must contain accompanying documentation (READMEs) describing the functionality, dependencies, and known issues.
47+
48+
- See `README.md` for existing samples and plugins for reference.
49+
50+
- All OSS components must have an accompanying test.
51+
52+
- If introducing a new component, such as a plugin, provide a test sample to verify the functionality.
53+
54+
- Make sure that you can contribute your work to open source (no license and/or patent conflict is introduced by your code). You will need to [`sign`](#signing-your-work) your commit.
55+
56+
- Thanks in advance for your patience as we review your contributions; we do appreciate them!
57+
58+
59+
#### Pull Requests
60+
Developer workflow for code contributions is as follows:
61+
62+
1. Developers must first [fork](https://help.github.com/en/articles/fork-a-repo) the [upstream](TBD) NVRx OSS repository.
63+
64+
2. Git clone the forked repository and push changes to the personal fork.
65+
66+
```bash
67+
git clone https://github.com/YOUR_USERNAME/YOUR_FORK.git NVRx
68+
# Checkout the targeted branch and commit changes
69+
# Push the commits to a branch on the fork (remote).
70+
git push -u origin <local-branch>:<remote-branch>
71+
```
72+
73+
3. Once the code changes are staged on the fork and ready for review, a [Pull Request](https://help.github.com/en/articles/about-pull-requests) (PR) can be [requested](https://help.github.com/en/articles/creating-a-pull-request) to merge the changes from a branch of the fork into a selected branch of upstream.
74+
* Exercise caution when selecting the source and target branches for the PR.
75+
Note that versioned releases of NVRx OSS are posted to `release/` branches of the upstream repo.
76+
* Creation of a PR kicks off the code review process.
77+
* At least one NVRx engineer will be assigned to the review.
78+
* While under review, mark your PRs as work-in-progress by prefixing the PR title with [WIP].
79+
80+
4. Since there is no CI/CD process in place yet, the PR will be accepted and the corresponding issue closed only after adequate testing has been completed, manually, by the developer and/or NVRx engineer reviewing the code.
81+
82+
83+
#### Signing Your Work
84+
85+
* We require that all contributors "sign-off" on their commits. This certifies that the contribution is your original work, or you have rights to submit it under the same license, or a compatible license.
86+
87+
* Any contribution which contains commits that are not Signed-Off will not be accepted.
88+
89+
* To sign off on a commit you simply use the `--signoff` (or `-s`) option when committing your changes:
90+
```bash
91+
$ git commit -s -m "Add cool feature."
92+
```
93+
This will append the following to your commit message:
94+
```
95+
Signed-off-by: Your Name <your@email.com>
96+
```
97+
98+
* Full text of the DCO:
99+
100+
```
101+
Developer Certificate of Origin
102+
Version 1.1
103+
104+
Copyright (C) 2004, 2006 The Linux Foundation and its contributors.
105+
1 Letterman Drive
106+
Suite D4700
107+
San Francisco, CA, 94129
108+
109+
Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed.
110+
```
111+
112+
```
113+
Developer's Certificate of Origin 1.1
114+
115+
By making a contribution to this project, I certify that:
116+
117+
(a) The contribution was created in whole or in part by me and I have the right to submit it under the open source license indicated in the file; or
118+
119+
(b) The contribution is based upon previous work that, to the best of my knowledge, is covered under an appropriate open source license and I have the right under that license to submit that work with modifications, whether created in whole or in part by me, under the same open source license (unless I am permitted to submit under a different license), as indicated in the file; or
120+
121+
(c) The contribution was provided directly to me by some other person who certified (a), (b) or (c) and I have not modified it.
122+
123+
(d) I understand and agree that this project and the contribution are public and that a record of the contribution (including all personal information I submit with it, including my sign-off) is maintained indefinitely and may be redistributed consistent with this project or the open source license(s) involved.
124+
```

LICENSE.txt

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.

0 commit comments

Comments
 (0)