Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
204 changes: 204 additions & 0 deletions .github/workflows/cloud-ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,204 @@
name: cloud-tests

on:
  # Runs only for pull requests targeting the staging branch
  pull_request:
    branches:
      - staging

# contents: write is needed by the "Summary" step to push the report;
# id-token: write allows OIDC-based cloud logins
permissions:
  id-token: write
  contents: write

jobs:
  cloud-tests:
    strategy:
      # One cloud allocation at a time: abort everything on first failure
      # and never provision two systems concurrently
      fail-fast: true
      max-parallel: 1
      matrix:
        # "<nodes>n:<gpus>g" — decoded by the "setup cloud" step below
        system: ["2n:4g"]
        include:
          - arch: cuda
            exclude: "no-cuda"
          # - arch: rocm
          #   exclude: "no-rocm"

    runs-on: ubuntu-latest
    environment: cloud-ci

    # Cancel previous jobs if a new version was pushed
    concurrency:
      group: "${{ github.ref }}-${{ matrix.arch }}-${{ matrix.system }}"
      cancel-in-progress: true

    defaults:
      run:
        # -e: exit on error, -l: login shell so profile-installed tools resolve
        shell: bash -el {0}

    env:
      MILABENCH_CONFIG: "config/standard.yaml"
      MILABENCH_SYSTEM: "config/cloud-multinodes-system.yaml"
      MILABENCH_BASE: "../output"
      MILABENCH_ARGS: ""
      MILABENCH_DASH: "no"
      MILABENCH_HF_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }}
      ARM_TENANT_ID: "${{ secrets.ARM_TENANT_ID }}"
      ARM_SUBSCRIPTION_ID: "${{ secrets.ARM_SUBSCRIPTION_ID }}"
      AZURE_CORE_OUTPUT: none
      # Comma-separated benchmark lists: multi-GPU-single-node vs multi-node
      _MULTI_GPUS: "diffusion-gpus,dinov2-giant-gpus,lightning-gpus,llava-gpus,resnet152-ddp-gpus,llm-full-mp-gpus,llm-lora-ddp-gpus,llm-lora-mp-gpus"
      _MULTI_NODES: "multinode"

    steps:
      # v3 ran on a deprecated Node.js runtime; v4 is a drop-in replacement
      - uses: actions/checkout@v4
        with:
          token: ${{ github.token }}

      # setup-python@v2 is deprecated; v5 is a drop-in replacement
      - uses: actions/setup-python@v5
        with:
          python-version: "3.10"

      # Follow
      # https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/guides/service_principal_client_secret
      # to generate a clientId as well as a clientSecret
      - name: Azure login
        uses: azure/login@v2
        with:
          creds: |
            {
              "clientId": "${{ secrets.ARM_CLIENT_ID }}",
              "clientSecret": "${{ secrets.ARM_CLIENT_SECRET }}",
              "subscriptionId": "${{ secrets.ARM_SUBSCRIPTION_ID }}",
              "tenantId": "${{ secrets.ARM_TENANT_ID }}"
            }

      - name: dependencies
        run: |
          python -m pip install -U pip
          python -m pip install -U poetry
          # NOTE(review): `poetry lock --no-update` was removed in Poetry 2.x;
          # pin poetry<2 or drop the flag if this starts failing
          poetry lock --no-update
          poetry install

      - name: setup cloud credentials
        run: |
          mkdir -p ~/.aws
          mkdir -p ~/.ssh/covalent
          echo "${{ secrets.COVALENT_EC2_EXECUTOR_KEYPAIR }}" >~/.ssh/covalent/covalent-ec2-executor-keypair.pem
          echo "[default]" >~/.aws/credentials
          echo "aws_access_key_id=${{ secrets.AWS_ACCESS_KEY_ID }}" >>~/.aws/credentials
          echo "aws_secret_access_key=${{ secrets.AWS_SECRET_ACCESS_KEY }}" >>~/.aws/credentials
          # Credentials must be readable by the owner only
          chmod -R a-rwx,u+rwX ~/.aws ~/.ssh

      - name: start covalent server
        run: |
          poetry run -- python3 -m milabench.scripts.covalent serve start --develop

      - name: setup cloud
        run: |
          # Decode the matrix entry "<nodes>:<gpus>" (e.g. "2n:4g")
          nodes=$(echo "${{ matrix.system }}" | cut -d":" -f1)
          gpus=$(echo "${{ matrix.system }}" | cut -d":" -f2)
          case "$nodes" in
            "1n")
              MILABENCH_SYSTEM="config/cloud-system.yaml"
              EXCLUDE="$EXCLUDE,$_MULTI_NODES"
              ;;
            "2n")
              MILABENCH_SYSTEM="config/cloud-multinodes-system.yaml"
              SELECT="$SELECT,$_MULTI_NODES"
              EXCLUDE="$EXCLUDE,$_MULTI_GPUS"
              ;;
            *)
              exit 1
              ;;
          esac
          case "$gpus" in
            "1g")
              RUN_ON="azure__a100"
              EXCLUDE="$EXCLUDE,$_MULTI_GPUS,$_MULTI_NODES"
              ;;
            # "2g")
            #   RUN_ON="azure__a100_x2"
            #   SELECT="$SELECT,$_MULTI_GPUS"
            #   ;;
            "4g")
              RUN_ON="azure__a100_x4"
              SELECT="$SELECT,$_MULTI_GPUS"
              ;;
            *)
              exit 1
              ;;
          esac

          # Strip the leading comma left by the "$VAR,$ADDED" concatenations
          if [[ -z "$(echo "$SELECT" | cut -d"," -f1)" ]]
          then
            SELECT="$(echo "$SELECT" | cut -d"," -f2-)"
          fi

          if [[ -z "$(echo "$EXCLUDE" | cut -d"," -f1)" ]]
          then
            EXCLUDE="$(echo "$EXCLUDE" | cut -d"," -f2-)"
          fi

          # Turn the non-empty lists into milabench CLI flags
          if [[ ! -z "$SELECT" ]]
          then
            SELECT="--select $SELECT"
          fi

          if [[ ! -z "$EXCLUDE" ]]
          then
            EXCLUDE="--exclude $EXCLUDE"
          fi

          echo "RUN_ON=$RUN_ON" >>"$GITHUB_ENV"

          # The setup output is the resolved system file, saved under a
          # per-profile suffix so teardown can recover the original path
          poetry run milabench cloud \
            --setup \
            --run-on "$RUN_ON" \
            --system "$MILABENCH_SYSTEM" >"$MILABENCH_SYSTEM.$RUN_ON"

          echo "MILABENCH_SYSTEM=$MILABENCH_SYSTEM.$RUN_ON" >>"$GITHUB_ENV"
          echo "SELECT=$SELECT" >>"$GITHUB_ENV"
          echo "EXCLUDE=$EXCLUDE" >>"$GITHUB_ENV"

      - name: install benchmarks
        run: |
          # $SELECT/$EXCLUDE hold "--flag value" pairs; word splitting is intended
          poetry run milabench install --variant ${{ matrix.arch }} $SELECT $EXCLUDE

      - name: prepare benchmarks
        run: |
          poetry run milabench prepare $SELECT $EXCLUDE

      - name: run benchmarks
        run: |
          poetry run milabench run $SELECT $EXCLUDE

      - name: Summary
        run: |
          git config credential.${{ github.server_url }}.username ${{ github.actor }}
          git config credential.helper '!f() { test "$1" = get && echo "password=$GITHUB_TOKEN"; }; f'
          git config --global user.email "[email protected]"
          git config --global user.name "GitHub CI"
          poetry run milabench report --push
        env:
          GITHUB_TOKEN: ${{ github.token }}

      - name: DEBUG state file
        if: always()
        run: |
          cat /tmp/milabench/covalent_venv/lib/python*/site-packages/covalent_azure_plugin/infra/*.tfstate

      - name: teardown cloud
        if: always()
        run: |
          # MILABENCH_SYSTEM was suffixed with ".$RUN_ON" during setup;
          # restore the original config path if it exists
          if [[ -f "${MILABENCH_SYSTEM%.*}" ]]
          then
            export MILABENCH_SYSTEM=${MILABENCH_SYSTEM%.*}
          fi
          poetry run milabench cloud \
            --teardown \
            --run-on "$RUN_ON" \
            --all

      - name: DEBUG logs
        if: always()
        run: |
          cat ~/.cache/covalent/covalent_ui.log
46 changes: 46 additions & 0 deletions benchmarks/_templates/simple/requirements.cpu.txt

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

13 changes: 3 additions & 10 deletions config/base.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ _defaults:
gpu_load_threshold: 0.5
gpu_mem_threshold: 0.5

num_machines: 1

_torchvision:
inherits: _defaults
definition: ../benchmarks/torchvision
Expand All @@ -26,7 +28,6 @@ _torchvision:
--loader: pytorch
--data: "{milabench_data}/FakeImageNet"


_torchvision_ddp:
inherits: _defaults
definition: ../benchmarks/torchvision_ddp
Expand Down Expand Up @@ -112,7 +113,6 @@ _timm:
--dataset: "FakeImageNet"
--workers: "auto({n_worker}, 8)"


_accelerate_opt:
inherits: _defaults
tags:
Expand Down Expand Up @@ -149,7 +149,6 @@ _accelerate_opt:
use_deepspeed: true
num_machines: 1


fp16:
inherits: _flops

Expand Down Expand Up @@ -389,7 +388,6 @@ brax:
--num-minibatches: 32
--num-envs: 8192


_diffusion:
inherits: _defaults
definition: ../benchmarks/diffusion
Expand Down Expand Up @@ -532,7 +530,6 @@ _llm:
definition: ../benchmarks/llm
install_group: torch


llm-lora-single:
inherits: _llm
plan:
Expand All @@ -550,7 +547,6 @@ llm-lora-single:
batch_size=8: true
gradient_accumulation_steps=8: true


llm-lora-ddp-gpus:
inherits: _llm
plan:
Expand All @@ -570,7 +566,6 @@ llm-lora-ddp-gpus:
batch_size=8: true
gradient_accumulation_steps=8: true


llm-lora-ddp-nodes:
tags:
- multinode
Expand All @@ -597,7 +592,6 @@ llm-lora-ddp-nodes:
requires_capabilities:
- "len(nodes) >= ${num_machines}"


llm-lora-mp-gpus:
inherits: _llm
plan:
Expand Down Expand Up @@ -743,7 +737,6 @@ torchatari:
--total-timesteps: 1000000
--env-id: Breakout-v5


llava-single:
inherits: _defaults
definition: ../benchmarks/llava
Expand Down Expand Up @@ -774,4 +767,4 @@ llava-gpus:
argv:
--batch_size: 1
--num_workers: 4
--gradient_accumulation_steps: 1
--gradient_accumulation_steps: 1
40 changes: 40 additions & 0 deletions config/cloud-multinodes-system.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
system:
  # Machines taking part in a run. The 1.1.1.1 addresses are placeholders:
  # the cloud setup step substitutes the provisioned instance IPs.
  nodes:
    - name: manager          # alias used to reference the node
      ip: 1.1.1.1            # placeholder ip
      port: 5000
      main: true             # this node acts as the master node
      user: user             # user for remote milabench operations

    - name: node1
      ip: 1.1.1.1
      main: false
      user: username

  # Cloud instance profiles, keyed as "<provider>__<flavor>"
  cloud_profiles:
    azure__a100:
      username: ubuntu
      size: Standard_NC24ads_A100_v4
      location: eastus2
      disk_size: 512
    azure__a100_x2:
      username: ubuntu
      size: Standard_NC48ads_A100_v4
      location: eastus2
      disk_size: 512
    azure__a100_x4:
      username: ubuntu
      size: Standard_NC96ads_A100_v4
      location: eastus2
      disk_size: 512
    azure__a10_x2:
      username: ubuntu
      size: Standard_NV72ads_A10_v5
      location: eastus2
      disk_size: 512
Loading
Loading