Skip to content
This repository was archived by the owner on May 6, 2025. It is now read-only.

Commit 2bb68e5

Browse files
committed
ci: adding HPU agents
1 parent 7acaf26 commit 2bb68e5

File tree

3 files changed

+117
-4
lines changed

3 files changed

+117
-4
lines changed

.azure/ci-testig-parameterized.yml

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ schedules:
1414
include: ["main"]
1515

1616
jobs:
17-
- template: testing-template.yml
17+
- template: cuda-template.yml
1818
parameters:
1919
configs:
2020
- "Lightning-AI/metrics_pl-develop.yaml"
@@ -24,3 +24,9 @@ jobs:
2424
- "microsoft/deepspeed-release.yaml"
2525
- "neptune-ai/lightning_integration.yaml"
2626
- "manujosephv/pytorch-tabular_lit-release.yaml"
27+
28+
- template: habana-template.yml
29+
parameters:
30+
configs:
31+
- "Lightning-AI/metrics_pl-develop.yaml"
32+
- "Lightning-AI/metrics_pl-release.yaml"

.azure/testing-template.yml renamed to .azure/cuda-template.yml

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -36,8 +36,6 @@ jobs:
3636
timeoutInMinutes: 75
3737
# how much time to give 'run always even if cancelled tasks' before stopping them
3838
cancelTimeoutInMinutes: 2
39-
workspace:
40-
clean: all
4139

4240
pool: 'lit-rtx-3090'
4341
# this requires Docker to be installed in the base image...
@@ -47,6 +45,9 @@ jobs:
4745
# image: "nvcr.io/nvidia/pytorch:21.11-py3"
4846
image: "pytorch/pytorch:1.13.0-cuda11.6-cudnn8-runtime"
4947
options: "--gpus=all --shm-size=8g -v /usr/bin/docker:/tmp/docker:ro"
48+
workspace:
49+
clean: all
50+
5051
steps:
5152

5253
- bash: |
@@ -70,7 +71,7 @@ jobs:
7071
7172
- bash: |
7273
sudo apt-get update -q --fix-missing
73-
sudo apt-get install -q -y build-essential gcc g++ cmake git unzip tree --no-install-recommends
74+
sudo apt-get install -q -y --no-install-recommends build-essential gcc g++ cmake git unzip tree
7475
# Python's dependencies
7576
pip --version
7677
pip install -r requirements.txt

.azure/habana-template.yml

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
jobs:

# Gate job: runs `_actions/assistant.py changed_configs` for the current pull
# request number and publishes whatever it prints as the output variable
# `diff` of the step named `files` (consumed downstream as `files.diff`).
- job: check_diff
  pool:
    vmImage: 'Ubuntu-20.04'
  steps:
  - bash: |
      pip --version
      pip install -q -r requirements.txt
      pip list
    displayName: 'Install dependencies'

  - script: |
      echo $PR_NUMBER
      # NOTE: stderr is merged into the captured value (2>&1), so any warning
      # text printed by the helper ends up in the variable as well.
      CONFIGS=$(python _actions/assistant.py changed_configs $PR_NUMBER --as_list=False 2>&1)
      # pass the value as an argument, never as the printf format string,
      # so '%' or backslashes in the output are printed verbatim
      printf "Changed configs: %s\n" "$CONFIGS"
      echo "##vso[task.setvariable variable=diff;isOutput=true]$CONFIGS"
    name: files
    env:
      PR_NUMBER: "$(System.PullRequest.PullRequestNumber)"
    displayName: 'Config diff'
# One job is generated per entry of `parameters.configs`.
- ${{ each config in parameters.configs }}:
  - job:
    displayName: "${{ config }}"
    dependsOn: check_diff
    variables:
      # map the output variable `files.diff` of the check_diff job into this job
      configs: $[ dependencies.check_diff.outputs['files.diff'] ]
      config: "${{ config }}"

    # run when building main, or when this config appears in the changed-configs string
    condition: or(eq(variables['Build.SourceBranch'], 'refs/heads/main'), contains(variables['configs'], variables['config']))
    # how long to run the job before automatically cancelling
    timeoutInMinutes: 75
    # how much time to give 'run always even if cancelled tasks' before stopping them
    cancelTimeoutInMinutes: 2

    pool: 'intel-hpus'
    # this requires Docker to be installed in the base image...
    container:
      image: "vault.habana.ai/gaudi-docker/1.8.0/ubuntu20.04/habanalabs/pytorch-installer-1.13.1:latest"
      # the host docker binary is mounted read-only at /tmp/docker; the first step uses it
      options: "--runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host --shm-size=4g -v /usr/bin/docker:/tmp/docker:ro"
    workspace:
      clean: all

    steps:

    - script: |
        # take the third '/'-separated field of the first cgroup line as this container's id
        container_id=$(head -1 /proc/self/cgroup|cut -d/ -f3)
        # re-enter the same container as uid 0 through the mounted docker binary to install sudo
        /tmp/docker exec -t -u 0 $container_id \
          sh -c "apt-get update && DEBIAN_FRONTEND=noninteractive apt-get -o Dpkg::Options::='--force-confold' -y install sudo"
        echo "##vso[task.setvariable variable=CONTAINER_ID]$container_id"
      displayName: 'Install Sudo in container (thanks Microsoft!)'

    - bash: |
        whoami && id
        sudo apt-get install -q -y hwinfo
        hwinfo --short
        python --version
        pip --version
        pip list
      displayName: 'Image info & HW'

    - bash: |
        sudo apt-get update -q --fix-missing
        sudo apt-get install -q -y --no-install-recommends build-essential gcc g++ cmake git unzip tree
        # Python's dependencies
        pip --version
        pip install -r requirements.txt
        pip list
      displayName: 'Install dependencies'

    #- bash: |
    #    echo $CONTAINER_ID
    #  displayName: 'Sanity check'

    - bash: |
        python _actions/assistant.py prepare_env --config_file=${{config}} > prepare_env.sh
        cat prepare_env.sh
      displayName: 'Create scripts'

    - bash: |
        bash prepare_env.sh
        # pip list
        tree .
      displayName: 'Prepare env.'

    - script: |
        ENVS=$(python _actions/assistant.py list_env --config_file=${{config}} --export 2>&1)
        # print captured output via %s so it is never interpreted as a printf format
        printf "PyTest env. variables: %s\n" "$ENVS"
        echo "##vso[task.setvariable variable=envs;isOutput=true]$ENVS"
        ARGS=$(python _actions/assistant.py specify_tests --config_file=${{config}} 2>&1)
        printf "PyTest arguments: %s\n" "$ARGS"
        echo "##vso[task.setvariable variable=args;isOutput=true]$ARGS"
      name: testing
      displayName: 'testing specs'

    - bash: |
        # the text captured by the 'testing' step is substituted here and executed as a shell line
        $(testing.envs)
        python -m pytest $(testing.args) -v
      workingDirectory: _integrations
      displayName: 'Integration tests'

    # ToDo: add Slack notification

0 commit comments

Comments
 (0)