134 changes: 134 additions & 0 deletions .github/launch_actions_runner.sh
@@ -0,0 +1,134 @@
#!/bin/bash
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=4
#SBATCH --mem=32G
#SBATCH --gpus=1
#SBATCH --time=00:30:00
#SBATCH --output=logs/runner_%j.out

## This script can be used to launch a new self-hosted GitHub runner.
## It can be launched with `sbatch` on a SLURM cluster, or run directly on a local machine.
## It assumes that the SH_TOKEN environment variable can be read from $HOME/.bash_aliases
## and contains a GitHub token used to authenticate with the GitHub API when registering
## the new self-hosted runner.
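##
## Example usage (a sketch: the l40s GPU type matches how the CI workflow below invokes this
## script, and logs/ matches the --output directive above; neither is a hard requirement):
##   mkdir -p logs && sbatch --gpus=l40s:1 launch_actions_runner.sh   # on a SLURM login node
##   bash launch_actions_runner.sh                                    # directly on a local machine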
set -euo pipefail
# TODO: `set -u` (nounset, enabled above) might cause issues when running this script on a
# local machine, since $SCRATCH and $SLURM_TMPDIR won't be set.

# Seems to be required for `uvx` to be found (adds $HOME/.cargo/bin to PATH).
source "$HOME/.cargo/env"
# This is where the SH_TOKEN secret environment variable is set.
source "$HOME/.bash_aliases"

readonly repo="mila-iqia/mila-docs"
readonly action_runner_version="2.317.0"
readonly expected_checksum_for_version="9e883d210df8c6028aff475475a457d380353f9d01877d51cc01a17b2a91161d"


# Check for required commands.
for cmd in curl tar uvx; do
    if ! command -v "$cmd" &> /dev/null; then
        echo "Error: $cmd is not installed."
        exit 1
    fi
done

if [ -z "${SH_TOKEN:-}" ]; then
    echo "Error: SH_TOKEN environment variable is not set."
    echo "This script requires the SH_TOKEN environment variable to be set to a GitHub token with permission to create new self-hosted runners for the current repository."
    echo "To create this token, follow the docs here:"
    echo " - https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens#creating-a-fine-grained-personal-access-token"
    echo " - and click here to create the new token: https://github.com/settings/personal-access-tokens/new"
    echo "The fine-grained token must have the 'Administration - repository permissions (write)' scope."
    exit 1
fi
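
# For reference, one way to store the token where this script expects it (a sketch; any file
# sourced above that exports SH_TOKEN would work just as well):
#   echo 'export SH_TOKEN="github_pat_XXXX"' >> ~/.bash_aliases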

archive="actions-runner-linux-x64-$action_runner_version.tar.gz"

# Look for the actions-runner archive.
# 1. If SLURM_TMPDIR is set:
#    - set WORKDIR to $SLURM_TMPDIR
#    - if the archive isn't already in $SCRATCH, download it from GitHub
#    - make a symlink to it in $SLURM_TMPDIR.
# 2. Otherwise, use ~/actions-runners/$repo as the WORKDIR, and download the archive from
#    GitHub if it isn't already there.

if [ -n "${SLURM_TMPDIR:-}" ]; then
    # This was launched with sbatch on a SLURM cluster.
    WORKDIR=$SLURM_TMPDIR
    if [ ! -f "$SCRATCH/$archive" ]; then
        curl --fail -o "$SCRATCH/$archive" \
            -L "https://github.com/actions/runner/releases/download/v$action_runner_version/$archive"
    fi
    if [ ! -L "$WORKDIR/$archive" ]; then
        ln -s "$SCRATCH/$archive" "$WORKDIR/$archive"
    fi
else
    # This was launched as a script on a local or dev machine, or in a non-SLURM environment.
    WORKDIR="$HOME/actions-runners/$repo"
    mkdir -p "$WORKDIR"
    if [ ! -f "$WORKDIR/$archive" ]; then
        curl --fail -o "$WORKDIR/$archive" \
            -L "https://github.com/actions/runner/releases/download/v$action_runner_version/$archive"
    fi
fi
echo "Setting up self-hosted runner in $WORKDIR"
cd "$WORKDIR"


# Check the archive integrity.
echo "$expected_checksum_for_version $archive" | shasum -a 256 -c
# Extract the installer
tar xzf $archive
# Use the GitHub API to get a temporary registration token for a new self-hosted runner.
# This requires you to be an admin of the repository and to have the $SH_TOKEN secret set to a
# GitHub token with (ideally only) the appropriate permissions.
# https://docs.github.com/en/rest/actions/self-hosted-runners?apiVersion=2022-11-28#create-a-registration-token-for-a-repository
# Example output:
# {
# "token": "XXXXX",
# "expires_at": "2020-01-22T12:13:35.123-08:00"
# }
t=$(mktemp) || exit
trap "rm -f -- '$t'" EXIT

# Write headers to the tempfile
cat <<EOF > "$t"
Accept: application/vnd.github+json
Authorization: Bearer $SH_TOKEN
X-GitHub-Api-Version: 2022-11-28
EOF

# Uses `uvx python` just to get a Python interpreter for parsing the JSON response.
# Assumes that `uv` is already installed.
TOKEN=$(curl --fail -L \
    -X POST \
    -H @"$t" \
    "https://api.github.com/repos/$repo/actions/runners/registration-token" | \
    uvx python -c "import sys, json; print(json.load(sys.stdin)['token'])")
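# (Alternative sketch, not used here: with `jq` available, the parsing step could instead be
#  `... | jq -r '.token'`.)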

rm -f -- "$t"
trap - EXIT


# If SLURM_CLUSTER_NAME is set, we're on a SLURM cluster, so configure the runner with --ephemeral.
export cluster=${SLURM_CLUSTER_NAME:-}
echo "Cluster name: $cluster"
# Create the runner and configure it programmatically with the token we just got
# from the GitHub API.

# For now, don't exit if the runner is already configured.
# This way, we might have more than one GitHub runner job running at once.
./config.sh --url "https://github.com/$repo" --token "$TOKEN" \
    --unattended --replace --labels "self-hosted${cluster:+,$cluster}" ${SLURM_CLUSTER_NAME:+--ephemeral} || true
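# To inspect the registered runners afterwards (a sketch, using the same token and GitHub's
# "list self-hosted runners for a repository" endpoint):
#   curl -H "Authorization: Bearer $SH_TOKEN" -H "Accept: application/vnd.github+json" \
#       "https://api.github.com/repos/$repo/actions/runners"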

# Set these environment variables, which are normally set by gh-actions when running in the cloud,
# so that they are visible in the Python test script. It's unclear why gh-actions doesn't set them on a self-hosted runner.
export GITHUB_ACTIONS="true"
export RUNNER_LABELS="self-hosted,$cluster"
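# For example, test code can then read them like so (a sketch, only to illustrate what gets exposed):
#   uvx python -c 'import os; print(os.environ["GITHUB_ACTIONS"], os.environ["RUNNER_LABELS"].split(","))'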

# Launch the actions runner.
exec ./run.sh
45 changes: 45 additions & 0 deletions .github/workflows/tests.yml
@@ -20,3 +20,48 @@ jobs:

      - name: Run files generation tests
        run: pre-commit run --all-files && [[ -z "$(git status -s)" ]]

  launch-slurm-actions-runner:
    needs: [generate_files]
    runs-on: self-hosted
    timeout-minutes: 5
    steps:
      - uses: actions/checkout@v4
      - name: Copy job script to the cluster
        # The script will be overwritten by different CI runs, but it shouldn't really
        # change, so that's fine.
        # TODO: there are some assumptions about the GPU type to use in that script.
        run: "scp .github/launch_actions_runner.sh mila:launch_actions_runner.sh"

      - name: Launch Slurm Actions Runner
        # TODO: Hard-coded Mila-specific GPU to use for running the tests.
        # This isn't great, but it is currently necessary for the tests to be reproducible.
        run: ssh mila 'cd $SCRATCH && sbatch --gpus=l40s:1 --parsable $HOME/launch_actions_runner.sh'
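
      # Assumption (sketch): the machine hosting this runner has a `mila` SSH host alias set up,
      # roughly like the following in its ~/.ssh/config (hostname and user are placeholders):
      #   Host mila
      #       HostName <mila-login-hostname>
      #       User <cluster-username>
      #       IdentityFile ~/.ssh/id_ed25519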

  # This job runs in a self-hosted GitHub Actions runner inside a SLURM job on a compute node of the cluster.
  slurm_integration_tests:
    name: Run integration tests on the Mila cluster in a SLURM job
    needs: [launch-slurm-actions-runner]
    runs-on: ["self-hosted", "mila"]
    timeout-minutes: 30
    steps:
      - uses: actions/checkout@v4
      - name: Install the latest version of uv
        uses: astral-sh/setup-uv@v6
        with:
          version: "latest"
          # https://github.com/astral-sh/setup-uv?tab=readme-ov-file#github-authentication-token
          github-token: ${{ secrets.GITHUB_TOKEN }}
      - name: Install dependencies
        run: uv sync --all-extras --frozen
      - name: Show installed packages
        run: uv pip list
      - name: Test with pytest
        run: uv run pytest -v --gen-missing --cov=project --cov-report=xml --cov-append

      # TODO: Add code coverage for the examples?
      # - name: Store coverage report as an artifact
      #   uses: actions/upload-artifact@v4
      #   with:
      #     name: coverage-reports-slurm-integration-tests-mila
      #     path: ./coverage.xml
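
      # (Sketch) Reproducing the test step by hand in an interactive SLURM job would look roughly like:
      #   salloc --gpus=l40s:1 --cpus-per-task=4 --mem=32G --time=00:30:00
      #   uv sync --all-extras --frozen && uv run pytest -v --gen-missing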
10 changes: 3 additions & 7 deletions docs/examples/distributed/multi_gpu/README.rst
@@ -28,7 +28,7 @@ Click here to see `the code for this example
#SBATCH --gpus-per-task=rtx8000:1
#SBATCH --cpus-per-task=4
-#SBATCH --ntasks-per-node=1
+#SBATCH --ntasks-per-node=4
+#SBATCH --ntasks-per-node=2
+#SBATCH --nodes=1
#SBATCH --mem=16G
#SBATCH --time=00:15:00
@@ -46,13 +46,9 @@ Click here to see `the code for this example
module load anaconda/3
module load cuda/11.7

# Creating the environment for the first time:
# conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \
# pytorch-cuda=11.7 -c pytorch -c nvidia
# Other conda packages:
# conda install -y -n pytorch -c conda-forge rich tqdm

+
# Activate pre-existing environment.
# NOTE: Use the `make_env.sh` script to create the environment if you haven't already.
conda activate pytorch


8 changes: 2 additions & 6 deletions docs/examples/distributed/multi_gpu/job.sh
100644 → 100755
@@ -1,7 +1,7 @@
#!/bin/bash
#SBATCH --gpus-per-task=rtx8000:1
#SBATCH --cpus-per-task=4
#SBATCH --ntasks-per-node=4
#SBATCH --ntasks-per-node=2
#SBATCH --nodes=1
#SBATCH --mem=16G
#SBATCH --time=00:15:00
@@ -19,13 +19,9 @@ module --quiet purge
module load anaconda/3
module load cuda/11.7

# Creating the environment for the first time:
# conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \
# pytorch-cuda=11.7 -c pytorch -c nvidia
# Other conda packages:
# conda install -y -n pytorch -c conda-forge rich tqdm

# Activate pre-existing environment.
# NOTE: Use the `make_env.sh` script to create the environment if you haven't already.
conda activate pytorch


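For reference (an inferred note, not part of the diff): with --nodes=1 and --ntasks-per-node=2, the multi-GPU example now launches 1 x 2 = 2 tasks on a single node, one rtx8000 GPU each. A quick sanity check inside the job could be:

echo "world size = $SLURM_NNODES x $SLURM_NTASKS_PER_NODE"   # expected: 1 x 2 = 2 tasks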
10 changes: 3 additions & 7 deletions docs/examples/distributed/multi_node/README.rst
@@ -29,7 +29,7 @@ Click here to see `the source code for this example
#!/bin/bash
#SBATCH --gpus-per-task=rtx8000:1
#SBATCH --cpus-per-task=4
#SBATCH --ntasks-per-node=4
#SBATCH --ntasks-per-node=2
-#SBATCH --nodes=1
+#SBATCH --nodes=2
#SBATCH --mem=16G
@@ -48,13 +48,9 @@ Click here to see `the source code for this example
module load anaconda/3
module load cuda/11.7

# Creating the environment for the first time:
# conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \
# pytorch-cuda=11.7 -c pytorch -c nvidia
# Other conda packages:
# conda install -y -n pytorch -c conda-forge rich tqdm

-
# Activate pre-existing environment.
# NOTE: Use the `make_env.sh` script to create the environment if you haven't already.
conda activate pytorch

-
9 changes: 2 additions & 7 deletions docs/examples/distributed/multi_node/job.sh
100644 → 100755
@@ -1,7 +1,7 @@
#!/bin/bash
#SBATCH --gpus-per-task=rtx8000:1
#SBATCH --cpus-per-task=4
#SBATCH --ntasks-per-node=4
#SBATCH --ntasks-per-node=2
#SBATCH --nodes=2
#SBATCH --mem=16G
#SBATCH --time=00:15:00
@@ -19,13 +19,8 @@ module --quiet purge
module load anaconda/3
module load cuda/11.7

# Creating the environment for the first time:
# conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \
# pytorch-cuda=11.7 -c pytorch -c nvidia
# Other conda packages:
# conda install -y -n pytorch -c conda-forge rich tqdm

# Activate pre-existing environment.
# NOTE: Use the `make_env.sh` script to create the environment if you haven't already.
conda activate pytorch

# Stage dataset into $SLURM_TMPDIR (only on the first worker of each node)
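For reference (an inferred note, not part of the diff): with --nodes=2 and --ntasks-per-node=2, the multi-node example launches 2 x 2 = 4 tasks in total, one rtx8000 GPU each. A quick sanity check inside the job could be:

echo "world size = $SLURM_NNODES x $SLURM_NTASKS_PER_NODE"   # expected: 2 x 2 = 4 tasks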
7 changes: 1 addition & 6 deletions docs/examples/distributed/single_gpu/README.rst
@@ -42,13 +42,8 @@ repository.
module load anaconda/3
module load cuda/11.7

# Creating the environment for the first time:
# conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \
# pytorch-cuda=11.7 -c pytorch -c nvidia
# Other conda packages:
# conda install -y -n pytorch -c conda-forge rich tqdm

# Activate pre-existing environment.
# NOTE: Use the `make_env.sh` script to create the environment if you haven't already.
conda activate pytorch


7 changes: 1 addition & 6 deletions docs/examples/distributed/single_gpu/job.sh
100644 → 100755
@@ -18,13 +18,8 @@ module --quiet purge
module load anaconda/3
module load cuda/11.7

# Creating the environment for the first time:
# conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \
# pytorch-cuda=11.7 -c pytorch -c nvidia
# Other conda packages:
# conda install -y -n pytorch -c conda-forge rich tqdm

# Activate pre-existing environment.
# NOTE: Use the `make_env.sh` script to create the environment if you haven't already.
conda activate pytorch


15 changes: 7 additions & 8 deletions docs/examples/frameworks/jax/README.rst
@@ -25,6 +25,8 @@ repository.
.. code:: diff
# distributed/single_gpu/job.sh -> frameworks/jax/job.sh
old mode 100755
new mode 100644
#!/bin/bash
#SBATCH --gpus-per-task=rtx8000:1
#SBATCH --cpus-per-task=4
@@ -45,11 +47,10 @@ repository.
module load anaconda/3
-module load cuda/11.7
# Creating the environment for the first time:
-# conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \
-# pytorch-cuda=11.7 -c pytorch -c nvidia
-# Other conda packages:
-# conda install -y -n pytorch -c conda-forge rich tqdm
-# Activate pre-existing environment.
-# NOTE: Use the `make_env.sh` script to create the environment if you haven't already.
-conda activate pytorch
+# Creating the environment for the first time:
+# conda create -y -n jax_ex -c "nvidia/label/cuda-11.8.0" cuda python=3.9 virtualenv pip
+# conda activate jax_ex
+# Install Jax using `pip`
@@ -59,9 +60,7 @@ repository.
+# -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html
+# Other pip packages:
+# pip install pillow optax rich torch torchvision flax tqdm
-# Activate pre-existing environment.
-conda activate pytorch
+
+# Activate the environment:
+conda activate jax_ex
11 changes: 5 additions & 6 deletions docs/examples/frameworks/jax_setup/README.rst
@@ -23,6 +23,8 @@ repository.
.. code:: diff
# frameworks/pytorch_setup/job.sh -> frameworks/jax_setup/job.sh
old mode 100755
new mode 100644
#!/bin/bash
#SBATCH --gres=gpu:1
#SBATCH --cpus-per-task=1
@@ -39,20 +41,17 @@ repository.
# See https://docs.mila.quebec/Userguide.html#conda for more information.
module load anaconda/3
# Creating the environment for the first time:
-# conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \
-# pytorch-cuda=11.6 -c pytorch -c nvidia
-# Other conda packages:
-# conda install -y -n pytorch -c conda-forge rich
+# Creating the environment for the first time:
+# conda create -y -n jax_ex -c "nvidia/label/cuda-11.8.0" cuda python=3.9 virtualenv pip
+# conda activate jax_ex
+# Install Jax using `pip`
+# *Please note* that as soon as you install packages from `pip install`, you
+# should not install any more packages using `conda install`
+# pip install --upgrade "jax[cuda11_pip]" \
+# -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html
+
# Activate the environment:
-# NOTE: Use the `make_env.sh` script to create the environment if you haven't already.
-conda activate pytorch
+conda activate jax_ex