Skip to content

Commit c3e5e0f

Browse files
authored
Merge pull request #3 from NREL/vtest-v1.157-merge
Merging v1.157
2 parents 5bb9807 + 11bcf83 commit c3e5e0f

File tree

140 files changed

+2911
-517
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

140 files changed

+2911
-517
lines changed

.github/workflows/extra.yml

Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
# Builds the "extra" fat images with Packer on an OpenStack CI cloud, enabling
# the doca, cuda and lustre inventory groups on top of the current fat images
# listed in environments/.stackhpc/terraform/cluster_image.auto.tfvars.json.
#
# Runs manually, on pushes to main, and on pull requests. The push and
# pull_request triggers only fire when the source-image definition, one of the
# doca/cuda/lustre roles, or this workflow file changes.
name: Test extra build
on:
  workflow_dispatch:
  push:
    branches:
      - main
    paths:
      - 'environments/.stackhpc/terraform/cluster_image.auto.tfvars.json'
      - 'ansible/roles/doca/**'
      - 'ansible/roles/cuda/**'
      - 'ansible/roles/lustre/**'
      - '.github/workflows/extra.yml'
  pull_request:
    paths:
      - 'environments/.stackhpc/terraform/cluster_image.auto.tfvars.json'
      - 'ansible/roles/doca/**'
      - 'ansible/roles/cuda/**'
      - 'ansible/roles/lustre/**'
      - '.github/workflows/extra.yml'

jobs:
  doca:
    name: extra-build
    concurrency:
      # One in-flight build per workflow + branch/PR + image name; a newer run
      # cancels the one already in progress for the same group.
      group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.build.image_name }}
      cancel-in-progress: true
    runs-on: ubuntu-22.04
    strategy:
      fail-fast: false # allow other matrix jobs to continue even if one fails
      matrix: # build RL8, RL9
        build:
          - image_name: openhpc-extra-RL8
            source_image_name_key: RL8 # key into environments/.stackhpc/terraform/cluster_image.auto.tfvars.json
            inventory_groups: doca,cuda,lustre
            volume_size: 30 # needed for cuda
          - image_name: openhpc-extra-RL9
            source_image_name_key: RL9
            inventory_groups: doca,cuda,lustre
            volume_size: 30 # needed for cuda
    env:
      ANSIBLE_FORCE_COLOR: True
      OS_CLOUD: openstack
      CI_CLOUD: ${{ vars.CI_CLOUD }} # default from repo settings
      ARK_PASSWORD: ${{ secrets.ARK_PASSWORD }}

    steps:
      # v4 runs on a supported Node runtime; v2 targets the retired Node 12.
      - uses: actions/checkout@v4

      - name: Load current fat images into GITHUB_ENV
        # The JSON file is multiline, so it is written with the delimiter form, see
        # https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/workflow-commands-for-github-actions#example-of-a-multiline-string
        run: |
          {
            echo 'FAT_IMAGES<<EOF'
            cat environments/.stackhpc/terraform/cluster_image.auto.tfvars.json
            echo EOF
          } >> "$GITHUB_ENV"

      - name: Record settings
        run: |
          echo CI_CLOUD: ${{ env.CI_CLOUD }}
          echo FAT_IMAGES: ${FAT_IMAGES}

      - name: Setup ssh
        # The private key is handed over through the environment and shell
        # tracing is left off, so the key is never part of the script text and
        # is never echoed into the job log.
        env:
          SSH_PRIVATE_KEY: ${{ secrets[format('{0}_SSH_KEY', env.CI_CLOUD)] }}
        run: |
          mkdir -p ~/.ssh
          printf '%s\n' "$SSH_PRIVATE_KEY" > ~/.ssh/id_rsa
          chmod 0600 ~/.ssh/id_rsa
        shell: bash

      - name: Add bastion's ssh key to known_hosts
        run: cat environments/.stackhpc/bastion_fingerprints >> ~/.ssh/known_hosts
        shell: bash

      - name: Install ansible etc
        run: dev/setup-env.sh

      - name: Write clouds.yaml
        # Passed via the environment so quotes, dollar signs or backticks in
        # the secret are written verbatim instead of being parsed by the shell.
        env:
          CLOUDS_YAML: ${{ secrets[format('{0}_CLOUDS_YAML', env.CI_CLOUD)] }}
        run: |
          mkdir -p ~/.config/openstack/
          printf '%s\n' "$CLOUDS_YAML" > ~/.config/openstack/clouds.yaml
        shell: bash

      - name: Setup environment
        # NOTE(review): each run step starts a fresh shell, so these activations
        # do not carry over; later steps source them again. This step therefore
        # only checks that both activate scripts can be sourced - confirm intent.
        run: |
          . venv/bin/activate
          . environments/.stackhpc/activate

      - name: Build fat image with packer
        id: packer_build
        # PKR_VAR_environment_root is not set in this file; presumably exported
        # by environments/.stackhpc/activate - TODO confirm.
        run: |
          set -x
          . venv/bin/activate
          . environments/.stackhpc/activate
          cd packer/
          packer init .

          PACKER_LOG=1 packer build \
            -on-error=${{ vars.PACKER_ON_ERROR }} \
            -var-file=$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl \
            -var "source_image_name=${{ fromJSON(env.FAT_IMAGES)['cluster_image'][matrix.build.source_image_name_key] }}" \
            -var "image_name=${{ matrix.build.image_name }}" \
            -var "inventory_groups=${{ matrix.build.inventory_groups }}" \
            -var "volume_size=${{ matrix.build.volume_size }}" \
            openstack.pkr.hcl

      - name: Get created image names from manifest
        id: manifest
        run: |
          . venv/bin/activate
          IMAGE_ID=$(jq --raw-output '.builds[-1].artifact_id' packer/packer-manifest.json)
          # jq prints "null" for a missing key; fail fast instead of polling forever.
          if [ -z "$IMAGE_ID" ] || [ "$IMAGE_ID" = "null" ]; then
            echo "No artifact_id found in packer/packer-manifest.json" >&2
            exit 1
          fi
          # Poll until the image can be looked up by ID.
          while ! openstack image show -f value -c name "$IMAGE_ID"; do
            sleep 5
          done
          IMAGE_NAME=$(openstack image show -f value -c name "$IMAGE_ID")
          echo "image-name=${IMAGE_NAME}" >> "$GITHUB_OUTPUT"
          echo "image-id=$IMAGE_ID" >> "$GITHUB_OUTPUT"
          echo "$IMAGE_ID" > image-id.txt
          echo "$IMAGE_NAME" > image-name.txt

      - name: Make image usable for further builds
        run: |
          . venv/bin/activate
          openstack image unset --property signature_verified "${{ steps.manifest.outputs.image-id }}"

      - name: Delete image for automatically-run workflows
        # Only manually dispatched runs keep the image they built.
        if: ${{ github.event_name != 'workflow_dispatch' }}
        run: |
          . venv/bin/activate
          openstack image delete "${{ steps.manifest.outputs.image-id }}"

      - name: Upload manifest artifact
        uses: actions/upload-artifact@v4
        with:
          name: image-details-${{ matrix.build.image_name }}
          path: |
            ./image-id.txt
            ./image-name.txt
          overwrite: true

.github/workflows/fatimage.yml

Lines changed: 22 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -15,36 +15,25 @@ jobs:
1515
openstack:
1616
name: openstack-imagebuild
1717
concurrency:
18-
group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os_version }}-${{ matrix.build }} # to branch/PR + OS + build
18+
group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.build.image_name }} # to branch/PR + OS
1919
cancel-in-progress: true
2020
runs-on: ubuntu-22.04
2121
strategy:
2222
fail-fast: false # allow other matrix jobs to continue even if one fails
23-
matrix: # build RL8+OFED, RL9+OFED, RL9+OFED+CUDA versions
24-
os_version:
25-
- RL8
26-
- RL9
23+
matrix: # build RL8, RL9
2724
build:
28-
- openstack.openhpc
29-
- openstack.openhpc-cuda
30-
exclude:
31-
- os_version: RL8
32-
build: openstack.openhpc-cuda
25+
- image_name: openhpc-RL8
26+
source_image_name: Rocky-8-GenericCloud-Base-8.10-20240528.0.x86_64.qcow2
27+
inventory_groups: control,compute,login,update
28+
- image_name: openhpc-RL9
29+
source_image_name: Rocky-9-GenericCloud-Base-9.5-20241118.0.x86_64.qcow2
30+
inventory_groups: control,compute,login,update
3331
env:
3432
ANSIBLE_FORCE_COLOR: True
3533
OS_CLOUD: openstack
3634
CI_CLOUD: ${{ github.event.inputs.ci_cloud }}
37-
SOURCE_IMAGES_MAP: |
38-
{
39-
"RL8": {
40-
"openstack.openhpc": "rocky-latest-RL8",
41-
"openstack.openhpc-cuda": "rocky-latest-cuda-RL8"
42-
},
43-
"RL9": {
44-
"openstack.openhpc": "rocky-latest-RL9",
45-
"openstack.openhpc-cuda": "rocky-latest-cuda-RL9"
46-
}
47-
}
35+
ARK_PASSWORD: ${{ secrets.ARK_PASSWORD }}
36+
LEAFCLOUD_PULP_PASSWORD: ${{ secrets.LEAFCLOUD_PULP_PASSWORD }}
4837

4938
steps:
5039
- uses: actions/checkout@v2
@@ -90,13 +79,11 @@ jobs:
9079
9180
PACKER_LOG=1 packer build \
9281
-on-error=${{ vars.PACKER_ON_ERROR }} \
93-
-only=${{ matrix.build }} \
9482
-var-file=$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl \
95-
-var "source_image_name=${{ env.SOURCE_IMAGE }}" \
83+
-var "source_image_name=${{ matrix.build.source_image_name }}" \
84+
-var "image_name=${{ matrix.build.image_name }}" \
85+
-var "inventory_groups=${{ matrix.build.inventory_groups }}" \
9686
openstack.pkr.hcl
97-
env:
98-
PKR_VAR_os_version: ${{ matrix.os_version }}
99-
SOURCE_IMAGE: ${{ fromJSON(env.SOURCE_IMAGES_MAP)[matrix.os_version][matrix.build] }}
10087
10188
- name: Get created image names from manifest
10289
id: manifest
@@ -107,14 +94,21 @@ jobs:
10794
sleep 5
10895
done
10996
IMAGE_NAME=$(openstack image show -f value -c name $IMAGE_ID)
97+
echo "image-name=${IMAGE_NAME}" >> "$GITHUB_OUTPUT"
98+
echo "image-id=$IMAGE_ID" >> "$GITHUB_OUTPUT"
11099
echo $IMAGE_ID > image-id.txt
111100
echo $IMAGE_NAME > image-name.txt
112101
102+
- name: Make image usable for further builds
103+
run: |
104+
. venv/bin/activate
105+
openstack image unset --property signature_verified "${{ steps.manifest.outputs.image-id }}"
106+
113107
- name: Upload manifest artifact
114108
uses: actions/upload-artifact@v4
115109
with:
116-
name: image-details-${{ matrix.build }}-${{ matrix.os_version }}
110+
name: image-details-${{ matrix.build.image_name }}
117111
path: |
118112
./image-id.txt
119113
./image-name.txt
120-
overwrite: true
114+
overwrite: true

.github/workflows/nightly-cleanup.yml

Lines changed: 43 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,8 @@
11
name: Cleanup CI clusters
22
on:
33
workflow_dispatch:
4-
inputs:
5-
ci_cloud:
6-
description: 'Select the CI_CLOUD'
7-
required: true
8-
type: choice
9-
options:
10-
- LEAFCLOUD
11-
- SMS
12-
- ARCUS
134
schedule:
14-
- cron: '0 20 * * *' # Run at 8PM - image sync runs at midnight
5+
- cron: '0 21 * * *' # Run at 9PM - image sync runs at midnight
156

167
jobs:
178
ci_cleanup:
@@ -52,20 +43,55 @@ jobs:
5243
- name: Find CI clusters
5344
run: |
5445
. venv/bin/activate
55-
CI_CLUSTERS=$(openstack server list | grep --only-matching 'slurmci-RL.-[0-9]\+' | sort | uniq)
56-
echo "ci_clusters=${CI_CLUSTERS}" >> GITHUB_ENV
46+
CI_CLUSTERS=$(openstack server list | grep --only-matching 'slurmci-RL.-[0-9]\+' | sort | uniq || true)
47+
echo "DEBUG: Raw CI clusters: $CI_CLUSTERS"
48+
49+
if [[ -z "$CI_CLUSTERS" ]]; then
50+
echo "No matching CI clusters found."
51+
else
52+
# Flatten multiline value so can be passed as env var
53+
CI_CLUSTERS_FORMATTED=$(echo "$CI_CLUSTERS" | tr '\n' ' ' | sed 's/ $//')
54+
echo "DEBUG: Formatted CI clusters: $CI_CLUSTERS_FORMATTED"
55+
echo "ci_clusters=$CI_CLUSTERS_FORMATTED" >> $GITHUB_ENV
56+
fi
5757
shell: bash
5858

5959
- name: Delete clusters if control node not tagged with keep
6060
run: |
6161
. venv/bin/activate
62-
for cluster_prefix in ${CI_CLUSTERS}
62+
if [[ -z ${ci_clusters} ]]; then
63+
echo "No clusters to delete."
64+
exit 0
65+
fi
66+
67+
for cluster_prefix in ${ci_clusters}
6368
do
64-
TAGS=$(openstack server show ${cluster_prefix}-control --column tags --format value)
65-
if [[ $TAGS =~ "keep" ]]; then
66-
echo "Skipping ${cluster_prefix} - control instance is tagged as keep"
69+
echo "Processing cluster: $cluster_prefix"
70+
# Get all servers with the matching name for control node
71+
CONTROL_SERVERS=$(openstack server list --name ${cluster_prefix}-control --format json)
72+
SERVER_COUNT=$(echo "$CONTROL_SERVERS" | jq length)
73+
74+
if [[ $SERVER_COUNT -gt 1 ]]; then
75+
echo "Multiple servers found for control node '${cluster_prefix}-control'. Checking tags for each..."
76+
77+
for server in $(echo "$CONTROL_SERVERS" | jq -r '.[].ID'); do
78+
# Get tags for each control node
79+
TAGS=$(openstack server show "$server" --column tags --format value)
80+
81+
if [[ $TAGS =~ "keep" ]]; then
82+
echo "Skipping ${cluster_prefix} (server ${server}) - control instance is tagged as keep"
83+
else
84+
./dev/delete-cluster.py ${cluster_prefix} --force
85+
fi
86+
done
6787
else
68-
yes | ./dev/delete-cluster.py ${cluster_prefix}
88+
# If only one server, extract its tags and proceed
89+
TAGS=$(echo "$CONTROL_SERVERS" | jq -r '.[0].Tags')
90+
if [[ $TAGS =~ "keep" ]]; then
91+
echo "Skipping ${cluster_prefix} - control instance is tagged as keep"
92+
else
93+
./dev/delete-cluster.py ${cluster_prefix} --force
94+
fi
6995
fi
7096
done
7197
shell: bash

0 commit comments

Comments
 (0)