Skip to content

Commit 97057ff

Browse files
Mohamed Habib
authored and committed
Bake burst worker AMI dependencies
1 parent f69aa39 commit 97057ff

4 files changed

Lines changed: 110 additions & 40 deletions

File tree

.github/workflows/build-aws-worker-ami.yml

Lines changed: 7 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -89,7 +89,7 @@ jobs:
8989
run: tar czf /tmp/packer-vector-ctx.tar.gz -C deploy vector
9090

9191
- name: Packer init
92-
run: packer init deploy/packer/worker-ami-aws.pkr.hcl
92+
run: packer init deploy/packer/worker-ami-aws-burst.pkr.hcl
9393

9494
- name: Build and publish AMI
9595
run: |
@@ -98,7 +98,11 @@ jobs:
9898
-var "agent_version=$AGENT_VERSION" \
9999
-var "region=$AWS_REGION" \
100100
-var "instance_type=$BUILDER_INSTANCE_TYPE" \
101-
deploy/packer/worker-ami-aws.pkr.hcl | tee /tmp/packer-output.txt
101+
-var "tigris_endpoint=${{ secrets.TIGRIS_ENDPOINT }}" \
102+
-var "tigris_access_key_id=${{ secrets.TIGRIS_ACCESS_KEY_ID }}" \
103+
-var "tigris_secret_access_key=${{ secrets.TIGRIS_SECRET_ACCESS_KEY }}" \
104+
-var "tigris_goldens_bucket=${{ secrets.TIGRIS_GOLDENS_BUCKET }}" \
105+
deploy/packer/worker-ami-aws-burst.pkr.hcl | tee /tmp/packer-output.txt
102106
103107
- name: Read AMI manifest
104108
id: ami
@@ -186,6 +190,7 @@ jobs:
186190
echo "- **Worker version:** \`$VERSION\`" >> "$GITHUB_STEP_SUMMARY"
187191
echo "- **Agent version:** \`$AGENT_VERSION\`" >> "$GITHUB_STEP_SUMMARY"
188192
echo "- **Golden version:** \`${GOLDEN_VERSION:-unknown}\`" >> "$GITHUB_STEP_SUMMARY"
193+
echo "- **Golden cache:** \`${{ secrets.TIGRIS_GOLDENS_BUCKET != '' && 'Tigris enabled' || 'disabled' }}\`" >> "$GITHUB_STEP_SUMMARY"
189194
echo "- **Region:** \`$AWS_REGION\`" >> "$GITHUB_STEP_SUMMARY"
190195
echo "- **SSM pointer:** \`$SSM_PARAMETER_PREFIX/worker-ami-id\`" >> "$GITHUB_STEP_SUMMARY"
191196
echo "" >> "$GITHUB_STEP_SUMMARY"

deploy/packer/worker-ami-aws.pkr.hcl renamed to deploy/packer/worker-ami-aws-burst.pkr.hcl

Lines changed: 82 additions & 28 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
# worker-ami-aws.pkr.hcl — Build an immutable AMI for OpenSandbox workers (QEMU backend) on AWS.
1+
# worker-ami-aws-burst.pkr.hcl — Build an immutable AMI for OpenSandbox Burst workers on AWS.
22
#
33
# Mirrors deploy/packer/worker-ami.pkr.hcl (Azure variant) but targets the
44
# amazon-ebs builder. The setup script (`deploy/azure/setup-azure-host.sh`)
@@ -7,10 +7,9 @@
77
#
88
# Differences from the Azure file:
99
# - amazon-ebs source on Ubuntu 24.04 LTS x86_64 instead of azure-arm.
10-
# - No rootfs blob caching (the Azure variant's elaborate Azure-blob cache
11-
# dance was the only Azure-API touch; for the PoC we just rebuild the
12-
# rootfs each time, ~10min extra per bake — acceptable for low rebuild
13-
# frequency).
10+
# - Optional Tigris/S3-compatible rootfs blob caching. Same rootfs inputs
11+
# reuse the same cached default.ext4, which keeps AMI builds fast and
12+
# golden versions stable when the guest image did not change.
1413
# - Installs awscli (needed by deploy/vector/populate-vector-env.sh AWS path
1514
# and by the worker user-data shared-disk attach).
1615
# - Tags the AMI for the terraform `aws_ami` data source lookup
@@ -26,8 +25,8 @@
2625
# tar czf /tmp/packer-rootfs-ctx.tar.gz deploy/firecracker/rootfs/ deploy/ec2/build-rootfs-docker.sh scripts/claude-agent-wrapper/
2726
#
2827
# # 3. Run packer:
29-
# packer init deploy/packer/worker-ami-aws.pkr.hcl
30-
# packer build -var "worker_version=$(git rev-parse --short HEAD)" deploy/packer/worker-ami-aws.pkr.hcl
28+
# packer init deploy/packer/worker-ami-aws-burst.pkr.hcl
29+
# packer build -var "worker_version=$(git rev-parse --short HEAD)" deploy/packer/worker-ami-aws-burst.pkr.hcl
3130
#
3231
# # 4. The data source in opencomputer-infra/terraform/aws/us-east-2-poc/ami.tf
3332
# # picks up the new AMI on the next `tofu apply`.
@@ -88,10 +87,30 @@ variable "vector_context" {
8887
description = "Pre-built tarball of deploy/vector/ (config + populator + units). Pre-create with: tar czf /tmp/packer-vector-ctx.tar.gz deploy/vector/"
8988
}
9089

91-
variable "golden_cache_bucket" {
90+
variable "tigris_endpoint" {
9291
type = string
9392
default = ""
94-
description = "Optional S3 bucket to upload the bake's golden default.ext4 to (under bases/<golden_version>/). Cell-scoped — e.g. oc-aws-us-east-2-poc-golden-cache. Empty = skip upload."
93+
description = "Optional S3-compatible endpoint for Tigris rootfs/golden cache."
94+
}
95+
96+
variable "tigris_access_key_id" {
97+
type = string
98+
default = ""
99+
sensitive = true
100+
description = "Optional Tigris access key for rootfs/golden cache."
101+
}
102+
103+
variable "tigris_secret_access_key" {
104+
type = string
105+
default = ""
106+
sensitive = true
107+
description = "Optional Tigris secret key for rootfs/golden cache."
108+
}
109+
110+
variable "tigris_goldens_bucket" {
111+
type = string
112+
default = ""
113+
description = "Optional Tigris bucket for content-addressed rootfs cache and golden uploads. Empty = skip cache."
95114
}
96115

97116
# ---------------------------------------------------------------------
@@ -104,8 +123,8 @@ source "amazon-ebs" "worker" {
104123
ssh_username = "ubuntu"
105124
ssh_pty = true
106125

107-
ami_name = "opensandbox-worker-${var.worker_version}-${formatdate("YYYYMMDD-hhmm", timestamp())}"
108-
ami_description = "OpenSandbox worker AMI (Ubuntu 24.04, QEMU/KVM nested-virt). Built from git ${var.worker_version}."
126+
ami_name = "opensandbox-burst-worker-${var.worker_version}-${formatdate("YYYYMMDD-hhmm", timestamp())}"
127+
ami_description = "OpenSandbox Burst worker AMI (Ubuntu 24.04, QEMU/KVM nested-virt). Built from git ${var.worker_version}."
109128

110129
source_ami_filter {
111130
filters = {
@@ -131,7 +150,7 @@ source "amazon-ebs" "worker" {
131150
# AMI tags — the terraform `aws_ami` data source in the AWS leaf filters
132151
# on these to pick the most-recent worker AMI for this cloud.
133152
tags = {
134-
Name = "opensandbox-worker-${var.worker_version}"
153+
Name = "opensandbox-burst-worker-${var.worker_version}"
135154
"opensandbox-role" = "worker"
136155
"opensandbox-cloud" = "aws"
137156
"opensandbox-version" = var.worker_version
@@ -204,18 +223,30 @@ build {
204223
}
205224

206225
# 6. AWS-specific: install awscli (used by populate-vector-env.sh and by
207-
# the worker user-data's shared-disk attach), then install binaries and
208-
# build the golden rootfs.
226+
# the worker user-data's shared-disk attach), bake OCFS2 dependencies for
227+
# the shared data volume, then install binaries and build the golden rootfs.
209228
provisioner "shell" {
210229
execute_command = "chmod +x {{ .Path }}; {{ .Vars }} sudo -E bash '{{ .Path }}'"
230+
environment_vars = [
231+
"TIGRIS_ENDPOINT=${var.tigris_endpoint}",
232+
"TIGRIS_ACCESS_KEY_ID=${var.tigris_access_key_id}",
233+
"TIGRIS_SECRET_ACCESS_KEY=${var.tigris_secret_access_key}",
234+
"TIGRIS_GOLDENS_BUCKET=${var.tigris_goldens_bucket}",
235+
"AWS_DEFAULT_REGION=auto",
236+
]
211237
inline = [
212238
# awscli v2 — apt's `awscli` is v1 and missing some commands we use.
213239
"apt-get update -qq",
214-
"apt-get install -y -qq unzip",
240+
"DEBIAN_FRONTEND=noninteractive apt-get install -y -qq unzip ocfs2-tools \"linux-modules-extra-$(uname -r)\"",
215241
"curl -fsSL 'https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip' -o /tmp/awscliv2.zip",
216242
"cd /tmp && unzip -q awscliv2.zip && ./aws/install --update",
217243
"rm -rf /tmp/awscliv2.zip /tmp/aws",
218244
"aws --version",
245+
"modprobe ocfs2",
246+
"modprobe ocfs2_dlmfs",
247+
"modprobe ocfs2_stack_o2cb",
248+
"command -v mount.ocfs2",
249+
"systemctl disable --now apt-daily.timer apt-daily-upgrade.timer apt-daily.service apt-daily-upgrade.service 2>/dev/null || true",
219250

220251
# Install worker + agent binaries.
221252
"mv /tmp/opensandbox-worker /usr/local/bin/opensandbox-worker",
@@ -228,15 +259,33 @@ build {
228259
"systemctl daemon-reload",
229260
"systemctl enable opensandbox-worker.service",
230261

231-
# Build the golden rootfs (no caching for PoC — every bake builds from scratch).
262+
# Build or restore the golden rootfs. The cache key is content-addressed
263+
# from the guest agent, rootfs sources, and guest kernel modules.
232264
"mkdir -p /tmp/rootfs-ctx",
233265
"cd /tmp/rootfs-ctx && tar xzf /tmp/rootfs-ctx.tar.gz",
234266
"INPUT_HASH=$({ sha256sum /usr/local/bin/osb-agent; find /tmp/rootfs-ctx -type f | sort | xargs sha256sum; sha256sum /opt/opensandbox/guest-modules/*.ko* 2>/dev/null; } | sha256sum | awk '{print $1}')",
235267
"echo \"Rootfs input hash: $INPUT_HASH\"",
236268
"ROOTFS_UUID=$(echo \"$INPUT_HASH\" | head -c 32 | sed 's/\\(........\\)\\(....\\)\\(....\\)\\(....\\)\\(............\\)/\\1-\\2-\\3-\\4-\\5/')",
237269
"export ROOTFS_UUID",
270+
"INPUT_HASH_SHORT=$(echo \"$INPUT_HASH\" | cut -c1-16)",
271+
"CACHE_KEY=\"rootfs-cache/$INPUT_HASH_SHORT/default.ext4\"",
272+
"CACHE_HIT=0",
238273
"mkdir -p /data/firecracker/images /opt/opensandbox/images",
239-
"cd /tmp/rootfs-ctx && bash deploy/ec2/build-rootfs-docker.sh /usr/local/bin/osb-agent /data/firecracker/images default",
274+
"if [ -n \"$TIGRIS_ENDPOINT\" ] && [ -n \"$TIGRIS_ACCESS_KEY_ID\" ] && [ -n \"$TIGRIS_SECRET_ACCESS_KEY\" ] && [ -n \"$TIGRIS_GOLDENS_BUCKET\" ]; then",
275+
" export AWS_ACCESS_KEY_ID=\"$TIGRIS_ACCESS_KEY_ID\" AWS_SECRET_ACCESS_KEY=\"$TIGRIS_SECRET_ACCESS_KEY\"",
276+
" echo \"Checking rootfs cache: s3://$TIGRIS_GOLDENS_BUCKET/$CACHE_KEY\"",
277+
" if aws s3 cp --endpoint-url \"$TIGRIS_ENDPOINT\" \"s3://$TIGRIS_GOLDENS_BUCKET/$CACHE_KEY\" /data/firecracker/images/default.ext4; then",
278+
" CACHE_HIT=1",
279+
" echo 'Rootfs restored from cache — skipping Docker build'",
280+
" else",
281+
" echo 'Rootfs cache miss — building from source'",
282+
" fi",
283+
"else",
284+
" echo 'Tigris cache credentials incomplete; rootfs cache disabled'",
285+
"fi",
286+
"if [ \"$CACHE_HIT\" != \"1\" ]; then",
287+
" cd /tmp/rootfs-ctx && ROOTFS_UUID=\"$ROOTFS_UUID\" bash deploy/ec2/build-rootfs-docker.sh /usr/local/bin/osb-agent /data/firecracker/images default",
288+
"fi",
240289
"cp /data/firecracker/images/default.ext4 /opt/opensandbox/images/default.ext4",
241290

242291
# Inject guest kernel modules into rootfs.
@@ -255,31 +304,36 @@ build {
255304
"GOLDEN_VERSION=$(/usr/local/bin/opensandbox-worker golden-version /opt/opensandbox/images/default.ext4 2>/dev/null || sha256sum /opt/opensandbox/images/default.ext4 | awk '{print $1}')",
256305
"echo \"$GOLDEN_VERSION\" > /opt/opensandbox/images/golden-version",
257306
"echo \"Golden version: $GOLDEN_VERSION\"",
307+
"if [ \"$CACHE_HIT\" != \"1\" ] && [ -n \"$TIGRIS_ENDPOINT\" ] && [ -n \"$TIGRIS_ACCESS_KEY_ID\" ] && [ -n \"$TIGRIS_SECRET_ACCESS_KEY\" ] && [ -n \"$TIGRIS_GOLDENS_BUCKET\" ]; then",
308+
" export AWS_ACCESS_KEY_ID=\"$TIGRIS_ACCESS_KEY_ID\" AWS_SECRET_ACCESS_KEY=\"$TIGRIS_SECRET_ACCESS_KEY\"",
309+
" echo \"Uploading rootfs cache: s3://$TIGRIS_GOLDENS_BUCKET/$CACHE_KEY\"",
310+
" aws s3 cp --endpoint-url \"$TIGRIS_ENDPOINT\" /opt/opensandbox/images/default.ext4 \"s3://$TIGRIS_GOLDENS_BUCKET/$CACHE_KEY\" || echo 'rootfs cache upload failed — continuing'",
311+
"fi",
258312
]
259313
}
260314

261-
# 7. Optional: upload the golden to S3 so the cell's shared-disk seeder
315+
# 7. Optional: upload the golden to Tigris so future hydration paths
262316
# and the future per-instance prefetch path can fetch it without rebuilding.
263317
provisioner "shell" {
264318
execute_command = "chmod +x {{ .Path }}; {{ .Vars }} sudo -E bash '{{ .Path }}'"
265319
environment_vars = [
266-
"GOLDEN_CACHE_BUCKET=${var.golden_cache_bucket}",
267-
"AWS_DEFAULT_REGION=${var.region}",
320+
"TIGRIS_ENDPOINT=${var.tigris_endpoint}",
321+
"TIGRIS_ACCESS_KEY_ID=${var.tigris_access_key_id}",
322+
"TIGRIS_SECRET_ACCESS_KEY=${var.tigris_secret_access_key}",
323+
"TIGRIS_GOLDENS_BUCKET=${var.tigris_goldens_bucket}",
324+
"AWS_DEFAULT_REGION=auto",
268325
]
269326
inline = [
270327
"set -e",
271-
"if [ -z \"$GOLDEN_CACHE_BUCKET\" ]; then",
272-
" echo 'No golden_cache_bucket set; skipping S3 upload (worker AMI still includes the baked golden)'",
328+
"if [ -z \"$TIGRIS_ENDPOINT\" ] || [ -z \"$TIGRIS_ACCESS_KEY_ID\" ] || [ -z \"$TIGRIS_SECRET_ACCESS_KEY\" ] || [ -z \"$TIGRIS_GOLDENS_BUCKET\" ]; then",
329+
" echo 'Tigris cache credentials incomplete; skipping golden upload (worker AMI still includes the baked golden)'",
273330
" exit 0",
274331
"fi",
332+
"export AWS_ACCESS_KEY_ID=\"$TIGRIS_ACCESS_KEY_ID\" AWS_SECRET_ACCESS_KEY=\"$TIGRIS_SECRET_ACCESS_KEY\"",
275333
"GOLDEN_VERSION=$(cat /opt/opensandbox/images/golden-version)",
276334
"S3_KEY=\"bases/$GOLDEN_VERSION/default.ext4\"",
277-
"echo \"Uploading default.ext4 → s3://$GOLDEN_CACHE_BUCKET/$S3_KEY (~4GB, will take a moment)\"",
278-
# Instance profile credentials — the bake runs on an EC2 instance and
279-
# picks up its role via the metadata service. If the builder role
280-
# doesn't have s3:PutObject on the cell's bucket, the upload fails
281-
# gracefully and the AMI still works (just without S3-side hydration).
282-
"aws s3 cp /opt/opensandbox/images/default.ext4 \"s3://$GOLDEN_CACHE_BUCKET/$S3_KEY\" || echo 'S3 upload failed — continuing (AMI golden is the only copy)'",
335+
"echo \"Uploading default.ext4 -> s3://$TIGRIS_GOLDENS_BUCKET/$S3_KEY (~4GB, will take a moment)\"",
336+
"aws s3 cp --endpoint-url \"$TIGRIS_ENDPOINT\" /opt/opensandbox/images/default.ext4 \"s3://$TIGRIS_GOLDENS_BUCKET/$S3_KEY\" || echo 'Tigris upload failed — continuing (AMI golden is the only copy)'",
283337
]
284338
}
285339

deploy/vector/populate-vector-env.sh

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -174,7 +174,7 @@ aws)
174174
exit 0
175175
fi
176176
if ! command -v aws >/dev/null 2>&1; then
177-
log "aws CLI not installed in AMI — populator can't fetch from Secrets Manager. Bake awscli into the worker image (see deploy/packer/worker-ami-aws.pkr.hcl)."
177+
log "aws CLI not installed in AMI — populator can't fetch from Secrets Manager. Bake awscli into the worker image (see deploy/packer/worker-ami-aws-burst.pkr.hcl)."
178178
exit 0
179179
fi
180180
# Auto-detect region from IMDSv2 so we don't have to plumb it via env.

0 commit comments

Comments
 (0)