Skip to content

Commit cb10a6a

Browse files
Mohamed Habib
authored and committed
Fix AWS burst worker bootstrap storage
1 parent 344f7a6 commit cb10a6a

1 file changed

Lines changed: 34 additions & 13 deletions

File tree

internal/compute/ec2.go

Lines changed: 34 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -470,6 +470,9 @@ func (p *EC2Pool) buildUserData(opts MachineOpts) string {
470470
sb.WriteString("systemctl stop opensandbox-worker.service 2>/dev/null || true\n")
471471
sb.WriteString("systemctl disable opensandbox-worker.service 2>/dev/null || true\n")
472472
sb.WriteString("systemctl reset-failed opensandbox-worker.service 2>/dev/null || true\n\n")
473+
sb.WriteString("systemctl stop opensandbox-server.service 2>/dev/null || true\n")
474+
sb.WriteString("systemctl disable opensandbox-server.service 2>/dev/null || true\n")
475+
sb.WriteString("systemctl reset-failed opensandbox-server.service 2>/dev/null || true\n\n")
473476

474477
sb.WriteString("# Instance identity from EC2 metadata (IMDSv2)\n")
475478
sb.WriteString("TOKEN=$(curl -fsS -X PUT 'http://169.254.169.254/latest/api/token' -H 'X-aws-ec2-metadata-token-ttl-seconds: 300')\n")
@@ -480,26 +483,25 @@ func (p *EC2Pool) buildUserData(opts MachineOpts) string {
480483
// NVMe instance store handling. Larger metal/x.gd instance families expose
481484
// multiple NVMe drives at /dev/nvme[1-N]n1; smaller instances have none, so
482485
// /data stays on the root filesystem. RAID 0 across instance store NVMe when present.
483-
sb.WriteString("# Mount data: prefer NVMe instance store (RAID 0), else first EBS data volume\n")
486+
sb.WriteString("# Mount data: prefer EC2 instance-store NVMe (RAID 0). Otherwise use root fs for /data.\n")
484487
sb.WriteString("if ! mountpoint -q /data 2>/dev/null; then\n")
485488
sb.WriteString(" mkdir -p /data\n")
486489
sb.WriteString(" ROOT_DEV=$(lsblk -no PKNAME $(findmnt -n -o SOURCE /) 2>/dev/null | head -1)\n")
487490
sb.WriteString(" NVME_DISKS=()\n")
488-
sb.WriteString(" for d in /dev/nvme1n1 /dev/nvme2n1 /dev/nvme3n1 /dev/nvme4n1 /dev/nvme5n1; do\n")
489-
sb.WriteString(" [ -b \"$d\" ] || continue\n")
490-
sb.WriteString(" [ \"$(basename $d)\" = \"$ROOT_DEV\" ] && continue\n")
491-
sb.WriteString(" NVME_DISKS+=(\"$d\")\n")
492-
sb.WriteString(" done\n")
491+
sb.WriteString(" while read -r name model; do\n")
492+
sb.WriteString(" [ -n \"${name:-}\" ] || continue\n")
493+
sb.WriteString(" [ \"$name\" = \"$ROOT_DEV\" ] && continue\n")
494+
sb.WriteString(" [ \"$model\" = \"Amazon EC2 NVMe Instance Storage\" ] || continue\n")
495+
sb.WriteString(" NVME_DISKS+=(\"/dev/$name\")\n")
496+
sb.WriteString(" done < <(lsblk -dn -o NAME,MODEL)\n")
497+
sb.WriteString(" if [ ${#NVME_DISKS[@]} -eq 0 ]; then\n")
498+
sb.WriteString(" echo 'No EC2 instance-store NVMe found; using root filesystem for /data'\n")
499+
sb.WriteString(" fi\n")
493500
sb.WriteString(" if [ ${#NVME_DISKS[@]} -gt 1 ]; then\n")
494501
sb.WriteString(" mdadm --create /dev/md0 --level=0 --raid-devices=${#NVME_DISKS[@]} \"${NVME_DISKS[@]}\" --run --force\n")
495502
sb.WriteString(" mkfs.xfs -f -m reflink=1 /dev/md0 && mount /dev/md0 /data\n")
496503
sb.WriteString(" elif [ ${#NVME_DISKS[@]} -eq 1 ]; then\n")
497504
sb.WriteString(" mkfs.xfs -f -m reflink=1 \"${NVME_DISKS[0]}\" && mount \"${NVME_DISKS[0]}\" /data\n")
498-
sb.WriteString(" else\n")
499-
sb.WriteString(" for d in /dev/nvme1n1 /dev/sdb /dev/xvdb; do\n")
500-
sb.WriteString(" [ -b \"$d\" ] || continue\n")
501-
sb.WriteString(" mkfs.xfs -f -m reflink=1 \"$d\" && mount \"$d\" /data && break\n")
502-
sb.WriteString(" done\n")
503505
sb.WriteString(" fi\n")
504506
sb.WriteString("fi\n")
505507
sb.WriteString("mkdir -p /data/sandboxes /data/firecracker/images\n")
@@ -591,6 +593,9 @@ func (p *EC2Pool) sharedSandboxDataUserData() string {
591593
sb.WriteString(" sleep 1\n")
592594
sb.WriteString("done\n")
593595
sb.WriteString("if [ -z \"${SANDBOX_DEV:-}\" ]; then echo \"ERROR: shared sandbox data volume not attached\"; lsblk -o NAME,MODEL,SERIAL,SIZE,FSTYPE,MOUNTPOINT || true; exit 1; fi\n")
596+
sb.WriteString("SANDBOX_SERIAL=$(lsblk -dn -o SERIAL \"$SANDBOX_DEV\" 2>/dev/null | head -1 || true)\n")
597+
sb.WriteString("if [ \"$SANDBOX_SERIAL\" != \"$SANDBOX_VOL_NO_DASH\" ]; then echo \"ERROR: $SANDBOX_DEV serial $SANDBOX_SERIAL does not match sandbox volume $SANDBOX_VOLUME_ID\"; lsblk -o NAME,MODEL,SERIAL,SIZE,FSTYPE,MOUNTPOINT || true; exit 1; fi\n")
598+
sb.WriteString("echo \"Using shared sandbox data volume $SANDBOX_VOLUME_ID at $SANDBOX_DEV\"\n")
594599
sb.WriteString("mapfile -t OCFS2_NODES < <(for i in $(seq 1 60); do aws ec2 describe-instances --region " + shellQuote(p.cfg.Region) + " --filters \"Name=tag:Cell,Values=" + shellEscapedDouble(p.cfg.CellID) + "\" \"Name=tag:Role,Values=worker\" \"Name=instance-state-name,Values=running\" --query 'Reservations[].Instances[].PrivateDnsName' --output text | tr '\\t' '\\n' | awk 'NF { sub(/\\..*/, \"\", $0); print }' | sort -u; break; done)\n")
595600
sb.WriteString("for i in $(seq 1 60); do\n")
596601
sb.WriteString(" [ \"${#OCFS2_NODES[@]}\" -ge \"$OCFS2_EXPECTED_NODES\" ] && break\n")
@@ -608,8 +613,10 @@ func (p *EC2Pool) sharedSandboxDataUserData() string {
608613
sb.WriteString("mkdir -p /data/sandboxes\n")
609614
sb.WriteString("FSTYPE=$(blkid -s TYPE -o value \"$SANDBOX_DEV\" 2>/dev/null || true)\n")
610615
sb.WriteString("if [ -z \"$FSTYPE\" ]; then mkfs.ocfs2 -F -N \"$OCFS2_MAX_NODES\" -L opensandbox-sandboxes -T vmstore \"$SANDBOX_DEV\"; fi\n")
616+
sb.WriteString("FSTYPE=$(blkid -s TYPE -o value \"$SANDBOX_DEV\" 2>/dev/null || true)\n")
617+
sb.WriteString("if [ \"$FSTYPE\" != \"ocfs2\" ]; then echo \"ERROR: shared sandbox data volume $SANDBOX_DEV has filesystem '$FSTYPE', expected ocfs2\"; lsblk -o NAME,MODEL,SERIAL,SIZE,FSTYPE,MOUNTPOINT || true; exit 1; fi\n")
611618
sb.WriteString("if ! grep -q 'LABEL=opensandbox-sandboxes' /etc/fstab; then echo 'LABEL=opensandbox-sandboxes /data/sandboxes ocfs2 noauto,_netdev,noatime 0 0' >> /etc/fstab; fi\n")
612-
sb.WriteString("timeout 90 mount -t ocfs2 -o noatime LABEL=opensandbox-sandboxes /data/sandboxes\n")
619+
sb.WriteString("timeout 90 mount -t ocfs2 -o noatime \"$SANDBOX_DEV\" /data/sandboxes\n")
613620
sb.WriteString("chown root:root /data/sandboxes\n\n")
614621
return sb.String()
615622
}
@@ -624,10 +631,24 @@ func (p *EC2Pool) sharedGoldensUserData() string {
624631
sb.WriteString("GOLDENS_VOL_NO_DASH=\"${GOLDENS_VOLUME_ID//-/}\"\n")
625632
sb.WriteString("for i in $(seq 1 120); do\n")
626633
sb.WriteString(" if [ -e \"/dev/disk/by-id/nvme-Amazon_Elastic_Block_Store_${GOLDENS_VOL_NO_DASH}\" ]; then GOLDENS_DEV=$(readlink -f \"/dev/disk/by-id/nvme-Amazon_Elastic_Block_Store_${GOLDENS_VOL_NO_DASH}\"); fi\n")
634+
sb.WriteString(" if [ -z \"${GOLDENS_DEV:-}\" ] && [ -e \"/dev/disk/by-id/nvme-Amazon_Elastic_Block_Store_${GOLDENS_VOL_NO_DASH}_1\" ]; then GOLDENS_DEV=$(readlink -f \"/dev/disk/by-id/nvme-Amazon_Elastic_Block_Store_${GOLDENS_VOL_NO_DASH}_1\"); fi\n")
635+
sb.WriteString(" if [ -z \"${GOLDENS_DEV:-}\" ]; then GOLDENS_DEV=$(lsblk -dn -o NAME,SERIAL | awk -v v=\"$GOLDENS_VOL_NO_DASH\" '$2 == v {print \"/dev/\"$1; exit}'); fi\n")
627636
sb.WriteString(" [ -n \"${GOLDENS_DEV:-}\" ] && break\n")
628637
sb.WriteString(" sleep 1\n")
629638
sb.WriteString("done\n")
630-
sb.WriteString("if [ -n \"${GOLDENS_DEV:-}\" ]; then mount -o ro,noload \"$GOLDENS_DEV\" /opt/opensandbox/goldens-shared || true; fi\n")
639+
sb.WriteString("if [ -n \"${GOLDENS_DEV:-}\" ]; then\n")
640+
sb.WriteString(" GOLDENS_SERIAL=$(lsblk -dn -o SERIAL \"$GOLDENS_DEV\" 2>/dev/null | head -1 || true)\n")
641+
sb.WriteString(" if [ \"$GOLDENS_SERIAL\" != \"$GOLDENS_VOL_NO_DASH\" ]; then echo \"WARN: $GOLDENS_DEV serial $GOLDENS_SERIAL does not match golden volume $GOLDENS_VOLUME_ID\"; GOLDENS_DEV=\"\"; fi\n")
642+
sb.WriteString("fi\n")
643+
sb.WriteString("if [ -n \"${GOLDENS_DEV:-}\" ]; then\n")
644+
sb.WriteString(" GOLDENS_FSTYPE=$(blkid -s TYPE -o value \"$GOLDENS_DEV\" 2>/dev/null || true)\n")
645+
sb.WriteString(" case \"$GOLDENS_FSTYPE\" in\n")
646+
sb.WriteString(" ext2|ext3|ext4) mount -t \"$GOLDENS_FSTYPE\" -o ro,noload,noatime \"$GOLDENS_DEV\" /opt/opensandbox/goldens-shared || true ;;\n")
647+
sb.WriteString(" xfs) mount -t xfs -o ro,noatime \"$GOLDENS_DEV\" /opt/opensandbox/goldens-shared || true ;;\n")
648+
sb.WriteString(" '') echo \"WARN: shared golden volume $GOLDENS_VOLUME_ID has no filesystem; continuing without it\" ;;\n")
649+
sb.WriteString(" *) echo \"WARN: shared golden volume $GOLDENS_VOLUME_ID has unsupported filesystem '$GOLDENS_FSTYPE'; continuing without it\" ;;\n")
650+
sb.WriteString(" esac\n")
651+
sb.WriteString("fi\n")
631652
sb.WriteString("if [ -d /opt/opensandbox/goldens-shared/golden ]; then ln -sfn /opt/opensandbox/goldens-shared/golden /var/lib/opensandbox/golden; fi\n\n")
632653
return sb.String()
633654
}

0 commit comments

Comments
 (0)