Skip to content
This repository was archived by the owner on Apr 30, 2026. It is now read-only.

Commit 7cffa47

Browse files
authored
Merge pull request #661 from instructlab/mergify/bp/release-v0.8/pr-631
Add the fallback logic to our e2e-large job (backport #631)
2 parents caab1de + bacacd4 commit 7cffa47

1 file changed

Lines changed: 54 additions & 36 deletions

File tree

.github/workflows/e2e-nvidia-l40s-x4.yml

Lines changed: 54 additions & 36 deletions
Original file line number | Diff line number | Diff line change
@@ -19,28 +19,59 @@ jobs:
1919
start-large-ec2-runner:
2020
runs-on: ubuntu-latest
2121
outputs:
22-
label: ${{ steps.start-ec2-runner.outputs.label }}
23-
ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
22+
label: ${{ steps.launch-ec2-instance-with-fallback.outputs.label }}
23+
ec2-instance-id: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-id }}
24+
ec2-instance-region: ${{ steps.launch-ec2-instance-with-fallback.outputs.ec2-instance-region }}
2425
steps:
25-
- name: Configure AWS credentials
26-
uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
26+
- name: Checkout "launch-ec2-runner-with-fallback" in-house CI action
27+
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
2728
with:
28-
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
29-
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
30-
aws-region: ${{ vars.AWS_REGION }}
31-
32-
- name: Start EC2 runner
33-
id: start-ec2-runner
34-
uses: machulav/ec2-github-runner@a8c20fc0876503410b2b966c124abc2311984ce2 # v2.3.9
29+
repository: instructlab/ci-actions
30+
# clone the "ci-actions" repo to a local directory called "ci-actions", instead of
31+
# overwriting the current WORKDIR contents
32+
path: ci-actions
33+
ref: release-v0.1
34+
sparse-checkout: |
35+
actions/launch-ec2-runner-with-fallback
36+
37+
- name: Launch EC2 Runner with Fallback
38+
id: launch-ec2-instance-with-fallback
39+
uses: ./ci-actions/actions/launch-ec2-runner-with-fallback
40+
env:
41+
TMPDIR: "/tmp"
3542
with:
36-
mode: start
37-
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
38-
ec2-image-id: ${{ vars.AWS_EC2_AMI }}
39-
ec2-instance-type: g6e.12xlarge
40-
subnet-id: subnet-024298cefa3bedd61
41-
security-group-id: sg-06300447c4a5fbef3
42-
iam-role-name: instructlab-ci-runner
43-
aws-resource-tags: >
43+
aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
44+
aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
45+
github_token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
46+
regions_config: >
47+
[
48+
{
49+
"region": "us-east-2",
50+
"subnets": {
51+
"us-east-2a": "${{ vars.SUBNET_US_EAST_2A }}",
52+
"us-east-2b": "${{ vars.SUBNET_US_EAST_2B }}",
53+
"us-east-2c": "${{ vars.SUBNET_US_EAST_2C }}"
54+
},
55+
"ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_2 }}",
56+
"security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_2 }}"
57+
},
58+
{
59+
"region": "us-east-1",
60+
"subnets": {
61+
"us-east-1a": "${{ vars.SUBNET_US_EAST_1A }}",
62+
"us-east-1b": "${{ vars.SUBNET_US_EAST_1B }}",
63+
"us-east-1c": "${{ vars.SUBNET_US_EAST_1C }}",
64+
"us-east-1d": "${{ vars.SUBNET_US_EAST_1D }}",
65+
"us-east-1e": "${{ vars.SUBNET_US_EAST_1E }}",
66+
"us-east-1f": "${{ vars.SUBNET_US_EAST_1F }}"
67+
},
68+
"ec2-ami": "${{ vars.AWS_EC2_AMI_US_EAST_1 }}",
69+
"security-group-id": "${{ vars.SECURITY_GROUP_ID_US_EAST_1 }}"
70+
}
71+
]
72+
try_spot_instance_first: false
73+
ec2_instance_type: g6e.12xlarge
74+
aws_resource_tags: >
4475
[
4576
{"Key": "Name", "Value": "instructlab-ci-github-large-runner"},
4677
{"Key": "GitHubRepository", "Value": "${{ github.repository }}"},
@@ -60,8 +91,8 @@ jobs:
6091
- name: Install Packages
6192
run: |
6293
cat /etc/os-release
63-
mkdir -p "${TMPDIR}"
64-
sudo dnf install -y gcc gcc-c++ make git python3.11 python3.11-devel
94+
mkdir -p /home/tmp
95+
sudo dnf install -y gcc gcc-c++ make git-core python3.11 python3.11-devel
6596
6697
- name: Checkout instructlab/instructlab
6798
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
@@ -144,20 +175,7 @@ jobs:
144175
- name: Install ilab
145176
working-directory: ./instructlab
146177
run: |
147-
export CUDA_HOME="/usr/local/cuda"
148-
export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
149-
export PATH="$PATH:$CUDA_HOME/bin"
150-
python3.11 -m venv --upgrade-deps venv
151-
. venv/bin/activate
152-
nvidia-smi
153-
python3.11 -m pip cache remove llama_cpp_python
154-
155-
CMAKE_ARGS="-DGGML_CUDA=on" python3.11 -m pip install -v . -c constraints-dev.txt
156-
157-
# https://github.com/instructlab/instructlab/issues/1821
158-
# install with Torch and build dependencies installed
159-
python3.11 -m pip install -v packaging wheel setuptools-scm
160-
python3.11 -m pip install -v .[cuda] -r requirements-vllm-cuda.txt -c constraints-dev.txt
178+
PYTHON="python3.11" ./scripts/install-ilab-with-cuda.sh
161179
162180
- name: Update instructlab-sdg library
163181
working-directory: ./sdg
@@ -263,7 +281,7 @@ jobs:
263281
with:
264282
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
265283
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
266-
aws-region: ${{ vars.AWS_REGION }}
284+
aws-region: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-region }}
267285

268286
- name: Stop EC2 runner
269287
uses: machulav/ec2-github-runner@a8c20fc0876503410b2b966c124abc2311984ce2 # v2.3.9

0 commit comments

Comments
 (0)