134 changes: 134 additions & 0 deletions .github/launch_actions_runner.sh
@@ -0,0 +1,134 @@
#!/bin/bash
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=4
#SBATCH --mem=32G
#SBATCH --gpus=1
#SBATCH --time=00:30:00
#SBATCH --output=logs/runner_%j.out

## This script can be used to launch a new self-hosted GitHub runner.
## It can be launched with `sbatch` on a SLURM cluster, or run directly on a local machine.
## It assumes that the SH_TOKEN environment variable can be read from $HOME/.bash_aliases
## and contains a GitHub token used to authenticate with the GitHub API when registering
## the new self-hosted runner.
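##
## Example usage (a sketch: the l40s GPU type matches how the CI workflow below invokes this
## script, and logs/ matches the --output directive above; neither is a hard requirement):
##   mkdir -p logs && sbatch --gpus=l40s:1 launch_actions_runner.sh   # on a SLURM login node
##   bash launch_actions_runner.sh                                    # directly on a local machine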
set -euo pipefail
# TODO: `set -u` (nounset, enabled above) might cause issues when running this script on a
# local machine, since $SCRATCH and $SLURM_TMPDIR won't be set.

# Seems to be required for `uvx` to be found (adds $HOME/.cargo/bin to PATH).
source "$HOME/.cargo/env"
# This is where the SH_TOKEN secret environment variable is set.
source "$HOME/.bash_aliases"

readonly repo="mila-iqia/mila-docs"
readonly action_runner_version="2.317.0"
readonly expected_checksum_for_version="9e883d210df8c6028aff475475a457d380353f9d01877d51cc01a17b2a91161d"


# Check for required commands.
for cmd in curl tar uvx; do
    if ! command -v "$cmd" &> /dev/null; then
        echo "Error: $cmd is not installed."
        exit 1
    fi
done

if [ -z "${SH_TOKEN:-}" ]; then
    echo "Error: SH_TOKEN environment variable is not set."
    echo "This script requires the SH_TOKEN environment variable to be set to a GitHub token with permission to create new self-hosted runners for the current repository."
    echo "To create this token, follow the docs here:"
    echo " - https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens#creating-a-fine-grained-personal-access-token"
    echo " - and click here to create the new token: https://github.com/settings/personal-access-tokens/new"
    echo "The fine-grained token must have the 'Administration - repository permissions (write)' scope."
    exit 1
fi
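
# For reference, one way to store the token where this script expects it (a sketch; any file
# sourced above that exports SH_TOKEN would work just as well):
#   echo 'export SH_TOKEN="github_pat_XXXX"' >> ~/.bash_aliases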

archive="actions-runner-linux-x64-$action_runner_version.tar.gz"

# Look for the actions-runner archive.
# 1. If SLURM_TMPDIR is set:
#    - set WORKDIR to $SLURM_TMPDIR
#    - if the archive isn't already in $SCRATCH, download it from GitHub
#    - make a symlink to it in $SLURM_TMPDIR.
# 2. Otherwise, use ~/actions-runners/$repo as the WORKDIR, and download the archive from
#    GitHub if it isn't already there.

if [ -n "${SLURM_TMPDIR:-}" ]; then
    # This was launched with sbatch on a SLURM cluster.
    WORKDIR=$SLURM_TMPDIR
    if [ ! -f "$SCRATCH/$archive" ]; then
        curl --fail -o "$SCRATCH/$archive" \
            -L "https://github.com/actions/runner/releases/download/v$action_runner_version/$archive"
    fi
    if [ ! -L "$WORKDIR/$archive" ]; then
        ln -s "$SCRATCH/$archive" "$WORKDIR/$archive"
    fi
else
    # This was launched as a script on a local or dev machine, or in a non-SLURM environment.
    WORKDIR="$HOME/actions-runners/$repo"
    mkdir -p "$WORKDIR"
    if [ ! -f "$WORKDIR/$archive" ]; then
        curl --fail -o "$WORKDIR/$archive" \
            -L "https://github.com/actions/runner/releases/download/v$action_runner_version/$archive"
    fi
fi
echo "Setting up self-hosted runner in $WORKDIR"
cd "$WORKDIR"


# Check the archive integrity.
echo "$expected_checksum_for_version $archive" | shasum -a 256 -c
# Extract the installer
tar xzf $archive
# Use the GitHub API to get a temporary registration token for a new self-hosted runner.
# This requires you to be an admin of the repository and to have the $SH_TOKEN secret set to a
# GitHub token with (ideally only) the appropriate permissions.
# https://docs.github.com/en/rest/actions/self-hosted-runners?apiVersion=2022-11-28#create-a-registration-token-for-a-repository
# Example output:
# {
# "token": "XXXXX",
# "expires_at": "2020-01-22T12:13:35.123-08:00"
# }
t=$(mktemp) || exit
trap "rm -f -- '$t'" EXIT

# Write headers to the tempfile
cat <<EOF > "$t"
Accept: application/vnd.github+json
Authorization: Bearer $SH_TOKEN
X-GitHub-Api-Version: 2022-11-28
EOF

# Uses `uvx python` just to get a Python interpreter for parsing the JSON response.
# Assumes that `uv` is already installed.
TOKEN=$(curl --fail -L \
    -X POST \
    -H @"$t" \
    "https://api.github.com/repos/$repo/actions/runners/registration-token" | \
    uvx python -c "import sys, json; print(json.load(sys.stdin)['token'])")
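# (Alternative sketch, not used here: with `jq` available, the parsing step could instead be
#  `... | jq -r '.token'`.)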

rm -f -- "$t"
trap - EXIT


# If SLURM_CLUSTER_NAME is set, we're on a SLURM cluster, so configure the runner with --ephemeral.
export cluster=${SLURM_CLUSTER_NAME:-}
echo "Cluster name: $cluster"
# Create the runner and configure it programmatically with the token we just got
# from the GitHub API.

# For now, don't exit if the runner is already configured.
# This way, we might have more than one GitHub runner job running at once.
./config.sh --url "https://github.com/$repo" --token "$TOKEN" \
    --unattended --replace --labels "self-hosted${cluster:+,$cluster}" ${SLURM_CLUSTER_NAME:+--ephemeral} || true
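# To inspect the registered runners afterwards (a sketch, using the same token and GitHub's
# "list self-hosted runners for a repository" endpoint):
#   curl -H "Authorization: Bearer $SH_TOKEN" -H "Accept: application/vnd.github+json" \
#       "https://api.github.com/repos/$repo/actions/runners"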

# Set these environment variables, which are normally set by gh-actions when running in the cloud,
# so that they are visible in the Python test script. It's unclear why gh-actions doesn't set them on a self-hosted runner.
export GITHUB_ACTIONS="true"
export RUNNER_LABELS="self-hosted,$cluster"
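# For example, test code can then read them like so (a sketch, only to illustrate what gets exposed):
#   uvx python -c 'import os; print(os.environ["GITHUB_ACTIONS"], os.environ["RUNNER_LABELS"].split(","))'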

# Launch the actions runner.
exec ./run.sh
45 changes: 45 additions & 0 deletions .github/workflows/tests.yml
@@ -20,3 +20,48 @@ jobs:

      - name: Run files generation tests
        run: pre-commit run --all-files && [[ -z "$(git status -s)" ]]

  launch-slurm-actions-runner:
    needs: [generate_files]
    runs-on: self-hosted
    timeout-minutes: 5
    steps:
      - uses: actions/checkout@v4
      - name: Copy job script to the cluster
        # The script will be overwritten by different CI runs, but it shouldn't really
        # change, so that's fine.
        # TODO: there are some assumptions about the GPU type to use in that script.
        run: "scp .github/launch_actions_runner.sh mila:launch_actions_runner.sh"

      - name: Launch Slurm Actions Runner
        # TODO: Hard-coded Mila-specific GPU to use for running the tests.
        # This isn't great, but it is currently necessary for the tests to be reproducible.
        run: ssh mila 'cd $SCRATCH && sbatch --gpus=l40s:1 --parsable $HOME/launch_actions_runner.sh'
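
      # Assumption (sketch): the machine hosting this runner has a `mila` SSH host alias set up,
      # roughly like the following in its ~/.ssh/config (hostname and user are placeholders):
      #   Host mila
      #       HostName <mila-login-hostname>
      #       User <cluster-username>
      #       IdentityFile ~/.ssh/id_ed25519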

  # This job runs in a self-hosted GitHub Actions runner inside a SLURM job on a compute node of the cluster.
  slurm_integration_tests:
    name: Run integration tests on the Mila cluster in a SLURM job
    needs: [launch-slurm-actions-runner]
    runs-on: ["self-hosted", "mila"]
    timeout-minutes: 30
    steps:
      - uses: actions/checkout@v4
      - name: Install the latest version of uv
        uses: astral-sh/setup-uv@v6
        with:
          version: "latest"
          # https://github.com/astral-sh/setup-uv?tab=readme-ov-file#github-authentication-token
          github-token: ${{ secrets.GITHUB_TOKEN }}
      - name: Install dependencies
        run: uv sync --all-extras --frozen
      - name: Show installed packages
        run: uv pip list
      - name: Test with pytest
        run: uv run pytest -v --gen-missing --cov=project --cov-report=xml --cov-append

      # TODO: Add code coverage for the examples?
      # - name: Store coverage report as an artifact
      #   uses: actions/upload-artifact@v4
      #   with:
      #     name: coverage-reports-slurm-integration-tests-mila
      #     path: ./coverage.xml
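
      # (Sketch) Reproducing the test step by hand in an interactive SLURM job would look roughly like:
      #   salloc --gpus=l40s:1 --cpus-per-task=4 --mem=32G --time=00:30:00
      #   uv sync --all-extras --frozen && uv run pytest -v --gen-missing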
10 changes: 3 additions & 7 deletions docs/examples/distributed/multi_gpu/README.rst
@@ -28,7 +28,7 @@ Click here to see `the code for this example
#SBATCH --gpus-per-task=rtx8000:1
#SBATCH --cpus-per-task=4
-#SBATCH --ntasks-per-node=1
+#SBATCH --ntasks-per-node=4
+#SBATCH --ntasks-per-node=2
+#SBATCH --nodes=1
#SBATCH --mem=16G
#SBATCH --time=00:15:00
@@ -46,13 +46,9 @@ Click here to see `the code for this example
module load anaconda/3
module load cuda/11.7

# Creating the environment for the first time:
# conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \
# pytorch-cuda=11.7 -c pytorch -c nvidia
# Other conda packages:
# conda install -y -n pytorch -c conda-forge rich tqdm

+
# Activate pre-existing environment.
# NOTE: Use the `make_env.sh` script to create the environment if you haven't already.
conda activate pytorch


8 changes: 2 additions & 6 deletions docs/examples/distributed/multi_gpu/job.sh
100644 → 100755
@@ -1,7 +1,7 @@
#!/bin/bash
#SBATCH --gpus-per-task=rtx8000:1
#SBATCH --cpus-per-task=4
#SBATCH --ntasks-per-node=4
#SBATCH --ntasks-per-node=2
#SBATCH --nodes=1
#SBATCH --mem=16G
#SBATCH --time=00:15:00
@@ -19,13 +19,9 @@ module --quiet purge
module load anaconda/3
module load cuda/11.7

# Creating the environment for the first time:
# conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \
# pytorch-cuda=11.7 -c pytorch -c nvidia
# Other conda packages:
# conda install -y -n pytorch -c conda-forge rich tqdm

# Activate pre-existing environment.
# NOTE: Use the `make_env.sh` script to create the environment if you haven't already.
conda activate pytorch


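For reference (an inferred note, not part of the diff): with --nodes=1 and --ntasks-per-node=2, the multi-GPU example now launches 1 x 2 = 2 tasks on a single node, one rtx8000 GPU each. A quick sanity check inside the job could be:

echo "world size = $SLURM_NNODES x $SLURM_NTASKS_PER_NODE"   # expected: 1 x 2 = 2 tasks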
10 changes: 3 additions & 7 deletions docs/examples/distributed/multi_node/README.rst
@@ -29,7 +29,7 @@ Click here to see `the source code for this example
#!/bin/bash
#SBATCH --gpus-per-task=rtx8000:1
#SBATCH --cpus-per-task=4
#SBATCH --ntasks-per-node=4
#SBATCH --ntasks-per-node=2
-#SBATCH --nodes=1
+#SBATCH --nodes=2
#SBATCH --mem=16G
@@ -48,13 +48,9 @@ Click here to see `the source code for this example
module load anaconda/3
module load cuda/11.7

# Creating the environment for the first time:
# conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \
# pytorch-cuda=11.7 -c pytorch -c nvidia
# Other conda packages:
# conda install -y -n pytorch -c conda-forge rich tqdm

-
# Activate pre-existing environment.
# NOTE: Use the `make_env.sh` script to create the environment if you haven't already.
conda activate pytorch

-
9 changes: 2 additions & 7 deletions docs/examples/distributed/multi_node/job.sh
100644 → 100755
@@ -1,7 +1,7 @@
#!/bin/bash
#SBATCH --gpus-per-task=rtx8000:1
#SBATCH --cpus-per-task=4
#SBATCH --ntasks-per-node=4
#SBATCH --ntasks-per-node=2
#SBATCH --nodes=2
#SBATCH --mem=16G
#SBATCH --time=00:15:00
@@ -19,13 +19,8 @@ module --quiet purge
module load anaconda/3
module load cuda/11.7

# Creating the environment for the first time:
# conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \
# pytorch-cuda=11.7 -c pytorch -c nvidia
# Other conda packages:
# conda install -y -n pytorch -c conda-forge rich tqdm

# Activate pre-existing environment.
# NOTE: Use the `make_env.sh` script to create the environment if you haven't already.
conda activate pytorch

# Stage dataset into $SLURM_TMPDIR (only on the first worker of each node)
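For reference (an inferred note, not part of the diff): with --nodes=2 and --ntasks-per-node=2, the multi-node example launches 2 x 2 = 4 tasks in total, one rtx8000 GPU each. A quick sanity check inside the job could be:

echo "world size = $SLURM_NNODES x $SLURM_NTASKS_PER_NODE"   # expected: 2 x 2 = 4 tasks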
7 changes: 1 addition & 6 deletions docs/examples/distributed/single_gpu/README.rst
@@ -42,13 +42,8 @@ repository.
module load anaconda/3
module load cuda/11.7

# Creating the environment for the first time:
# conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \
# pytorch-cuda=11.7 -c pytorch -c nvidia
# Other conda packages:
# conda install -y -n pytorch -c conda-forge rich tqdm

# Activate pre-existing environment.
# NOTE: Use the `make_env.sh` script to create the environment if you haven't already.
conda activate pytorch


7 changes: 1 addition & 6 deletions docs/examples/distributed/single_gpu/job.sh
100644 → 100755
@@ -18,13 +18,8 @@ module --quiet purge
module load anaconda/3
module load cuda/11.7

# Creating the environment for the first time:
# conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \
# pytorch-cuda=11.7 -c pytorch -c nvidia
# Other conda packages:
# conda install -y -n pytorch -c conda-forge rich tqdm

# Activate pre-existing environment.
# NOTE: Use the `make_env.sh` script to create the environment if you haven't already.
conda activate pytorch


15 changes: 7 additions & 8 deletions docs/examples/frameworks/jax/README.rst
@@ -25,6 +25,8 @@ repository.
.. code:: diff
# distributed/single_gpu/job.sh -> frameworks/jax/job.sh
old mode 100755
new mode 100644
#!/bin/bash
#SBATCH --gpus-per-task=rtx8000:1
#SBATCH --cpus-per-task=4
@@ -45,11 +47,10 @@ repository.
module load anaconda/3
-module load cuda/11.7
# Creating the environment for the first time:
-# conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \
-# pytorch-cuda=11.7 -c pytorch -c nvidia
-# Other conda packages:
-# conda install -y -n pytorch -c conda-forge rich tqdm
-# Activate pre-existing environment.
-# NOTE: Use the `make_env.sh` script to create the environment if you haven't already.
-conda activate pytorch
+# Creating the environment for the first time:
+# conda create -y -n jax_ex -c "nvidia/label/cuda-11.8.0" cuda python=3.9 virtualenv pip
+# conda activate jax_ex
+# Install Jax using `pip`
@@ -59,9 +60,7 @@ repository.
+# -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html
+# Other pip packages:
+# pip install pillow optax rich torch torchvision flax tqdm
-# Activate pre-existing environment.
-conda activate pytorch
+
+# Activate the environment:
+conda activate jax_ex
11 changes: 5 additions & 6 deletions docs/examples/frameworks/jax_setup/README.rst
@@ -23,6 +23,8 @@ repository.
.. code:: diff
# frameworks/pytorch_setup/job.sh -> frameworks/jax_setup/job.sh
old mode 100755
new mode 100644
#!/bin/bash
#SBATCH --gres=gpu:1
#SBATCH --cpus-per-task=1
@@ -39,20 +41,17 @@ repository.
# See https://docs.mila.quebec/Userguide.html#conda for more information.
module load anaconda/3
# Creating the environment for the first time:
-# conda create -y -n pytorch python=3.9 pytorch torchvision torchaudio \
-# pytorch-cuda=11.6 -c pytorch -c nvidia
-# Other conda packages:
-# conda install -y -n pytorch -c conda-forge rich
+# Creating the environment for the first time:
+# conda create -y -n jax_ex -c "nvidia/label/cuda-11.8.0" cuda python=3.9 virtualenv pip
+# conda activate jax_ex
+# Install Jax using `pip`
+# *Please note* that as soon as you install packages from `pip install`, you
+# should not install any more packages using `conda install`
+# pip install --upgrade "jax[cuda11_pip]" \
+# -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html
+
# Activate the environment:
-# NOTE: Use the `make_env.sh` script to create the environment if you haven't already.
-conda activate pytorch
+conda activate jax_ex