Skip to content

Commit 2c28271

Browse files
Merge pull request #2849 from bunkerity/dev
feat: Improve SSH configuration with enhanced keepalive and timeout settings for long-running builds
2 parents 166707d + cad0438 commit 2c28271

4 files changed

Lines changed: 37 additions & 371 deletions

File tree

.github/workflows/container-build.yml

Lines changed: 9 additions & 94 deletions
Original file line numberDiff line numberDiff line change
@@ -59,107 +59,22 @@ jobs:
5959
echo "$SSH_KEY" > ~/.ssh/id_rsa_arm
6060
chmod 600 ~/.ssh/id_rsa_arm
6161
echo "$SSH_CONFIG" | sed "s/SSH_IP/$SSH_IP/g" > ~/.ssh/config
62-
# Enhanced keepalive and timeout settings
62+
# Enhanced keepalive and timeout settings for long-running builds
6363
cat >> ~/.ssh/config << 'EOF'
64-
ServerAliveInterval 30
65-
ServerAliveCountMax 3
64+
ServerAliveInterval 15
65+
ServerAliveCountMax 20
6666
TCPKeepAlive yes
67-
ConnectTimeout 10
68-
ConnectionAttempts 1
67+
ConnectTimeout 30
68+
ConnectionAttempts 3
69+
ControlMaster auto
70+
ControlPath ~/.ssh/control-%C
71+
ControlPersist 1h
72+
StrictHostKeyChecking no
6973
EOF
7074
env:
7175
SSH_KEY: ${{ secrets.ARM_SSH_KEY }}
7276
SSH_IP: ${{ secrets.ARM_SSH_IP }}
7377
SSH_CONFIG: ${{ secrets.ARM_SSH_CONFIG }}
74-
- name: Check ARM SSH connectivity
75-
if: startsWith(inputs.CACHE_SUFFIX, 'arm')
76-
run: |
77-
echo "=== Starting enhanced SSH connectivity check ==="
78-
SSH_HOST="root@$SSH_IP"
79-
MAX_RETRIES=10
80-
RETRY_COUNT=0
81-
SUCCESS=false
82-
83-
# Enhanced SSH options for stability
84-
SSH_OPTS="-o StrictHostKeyChecking=no \
85-
-o ConnectTimeout=10 \
86-
-o ConnectionAttempts=1 \
87-
-o ServerAliveInterval=30 \
88-
-o ServerAliveCountMax=3 \
89-
-o TCPKeepAlive=yes \
90-
-o BatchMode=yes \
91-
-i ~/.ssh/id_rsa_arm"
92-
93-
while [ $RETRY_COUNT -lt $MAX_RETRIES ]; do
94-
RETRY_COUNT=$((RETRY_COUNT + 1))
95-
echo "----------------------------------------"
96-
echo "Attempt $RETRY_COUNT of $MAX_RETRIES"
97-
echo "Timestamp: $(date '+%Y-%m-%d %H:%M:%S')"
98-
99-
# First check if SSH port is reachable
100-
echo "→ Checking TCP port 22..."
101-
if timeout 5 bash -c "cat < /dev/null > /dev/tcp/$SSH_IP/22" 2>/dev/null; then
102-
echo "✓ Port 22 is open"
103-
104-
# Try SSH connection with comprehensive checks
105-
echo "→ Testing SSH connection..."
106-
if ssh $SSH_OPTS $SSH_HOST "echo '✓ SSH connection established' && \
107-
echo '→ Checking Docker...' && \
108-
docker info > /dev/null 2>&1 && \
109-
echo '✓ Docker is running' && \
110-
echo '→ Checking network...' && \
111-
ping -c 1 9.9.9.9 > /dev/null 2>&1 && \
112-
echo '✓ Network connectivity verified (Quad9)' && \
113-
echo '→ System info:' && \
114-
uptime && \
115-
free -h | head -n 2" 2>&1; then
116-
echo "✓✓✓ All connectivity checks passed! ✓✓✓"
117-
SUCCESS=true
118-
break
119-
else
120-
echo "✗ SSH command execution failed"
121-
fi
122-
else
123-
echo "✗ Port 22 is not reachable"
124-
fi
125-
126-
# Calculate backoff delay (exponential: 5, 10, 15, 20, 30, 30, ...)
127-
if [ $RETRY_COUNT -lt $MAX_RETRIES ]; then
128-
DELAY=$((5 * RETRY_COUNT))
129-
[ $DELAY -gt 30 ] && DELAY=30
130-
echo "⏳ Waiting ${DELAY}s before retry..."
131-
sleep $DELAY
132-
fi
133-
done
134-
135-
if [ "$SUCCESS" = false ]; then
136-
echo "========================================="
137-
echo "✗✗✗ FATAL: All $MAX_RETRIES connection attempts failed ✗✗✗"
138-
echo "This likely indicates:"
139-
echo " - ARM node is not accessible"
140-
echo " - Network connectivity issues"
141-
echo " - SSH service not running"
142-
echo " - Docker service not running"
143-
echo "========================================="
144-
exit 1
145-
fi
146-
147-
# Final stabilization period
148-
echo "========================================="
149-
echo "⏳ Allowing connection to stabilize (10s)..."
150-
sleep 10
151-
152-
# Final verification
153-
echo "→ Final connectivity verification..."
154-
if ssh $SSH_OPTS $SSH_HOST "echo '✓ Final check: SSH OK' && docker ps > /dev/null 2>&1 && echo '✓ Final check: Docker OK'"; then
155-
echo "✓✓✓ SSH connectivity fully stable and ready ✓✓✓"
156-
echo "========================================="
157-
else
158-
echo "✗✗✗ FATAL: Final verification failed ✗✗✗"
159-
exit 1
160-
fi
161-
env:
162-
SSH_IP: ${{ secrets.ARM_SSH_IP }}
16378
- name: Setup Buildx
16479
uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # v3.11.1
16580
if: startsWith(inputs.CACHE_SUFFIX, 'arm') == false

.github/workflows/create-arm.yml

Lines changed: 10 additions & 90 deletions
Original file line numberDiff line numberDiff line change
@@ -76,103 +76,23 @@ jobs:
7676
echo "$SSH_KEY" > ~/.ssh/id_rsa_arm
7777
chmod 600 ~/.ssh/id_rsa_arm
7878
echo "$SSH_CONFIG" | sed "s/SSH_IP/$SSH_IP/g" > ~/.ssh/config
79-
# Enhanced keepalive and timeout settings
79+
# Enhanced keepalive and timeout settings for long-running builds
8080
cat >> ~/.ssh/config << 'EOF'
81-
ServerAliveInterval 30
82-
ServerAliveCountMax 3
81+
ServerAliveInterval 15
82+
ServerAliveCountMax 20
8383
TCPKeepAlive yes
84-
ConnectTimeout 10
85-
ConnectionAttempts 1
84+
ConnectTimeout 30
85+
ConnectionAttempts 3
86+
ControlMaster auto
87+
ControlPath ~/.ssh/control-%C
88+
ControlPersist 1h
89+
StrictHostKeyChecking no
8690
EOF
8791
env:
8892
SSH_KEY: ${{ secrets.ARM_SSH_KEY }}
8993
SSH_IP: ${{ fromJson(steps.scw.outputs.json).public_ip.address }}
9094
SSH_CONFIG: ${{ secrets.ARM_SSH_CONFIG }}
91-
- name: Check ARM SSH connectivity
92-
run: |
93-
echo "=== Starting enhanced SSH connectivity check ==="
94-
SSH_HOST="root@$SSH_IP"
95-
MAX_RETRIES=10
96-
RETRY_COUNT=0
97-
SUCCESS=false
98-
99-
# Enhanced SSH options for stability
100-
SSH_OPTS="-o StrictHostKeyChecking=no \
101-
-o ConnectTimeout=10 \
102-
-o ConnectionAttempts=1 \
103-
-o ServerAliveInterval=30 \
104-
-o ServerAliveCountMax=3 \
105-
-o TCPKeepAlive=yes \
106-
-o BatchMode=yes \
107-
-i ~/.ssh/id_rsa_arm"
108-
109-
while [ $RETRY_COUNT -lt $MAX_RETRIES ]; do
110-
RETRY_COUNT=$((RETRY_COUNT + 1))
111-
echo "----------------------------------------"
112-
echo "Attempt $RETRY_COUNT of $MAX_RETRIES"
113-
echo "Timestamp: $(date '+%Y-%m-%d %H:%M:%S')"
114-
115-
# First check if SSH port is reachable
116-
echo "→ Checking TCP port 22..."
117-
if timeout 5 bash -c "cat < /dev/null > /dev/tcp/$SSH_IP/22" 2>/dev/null; then
118-
echo "✓ Port 22 is open"
119-
120-
# Try SSH connection with comprehensive checks
121-
echo "→ Testing SSH connection..."
122-
if ssh $SSH_OPTS $SSH_HOST "echo '✓ SSH connection established' && \
123-
echo '→ Checking network...' && \
124-
ping -c 1 9.9.9.9 > /dev/null 2>&1 && \
125-
echo '✓ Network connectivity verified (Quad9)' && \
126-
echo '→ System info:' && \
127-
uptime && \
128-
free -h | head -n 2" 2>&1; then
129-
echo "✓✓✓ All connectivity checks passed! ✓✓✓"
130-
SUCCESS=true
131-
break
132-
else
133-
echo "✗ SSH command execution failed"
134-
fi
135-
else
136-
echo "✗ Port 22 is not reachable"
137-
fi
138-
139-
# Calculate backoff delay (exponential: 5, 10, 15, 20, 30, 30, ...)
140-
if [ $RETRY_COUNT -lt $MAX_RETRIES ]; then
141-
DELAY=$((5 * RETRY_COUNT))
142-
[ $DELAY -gt 30 ] && DELAY=30
143-
echo "⏳ Waiting ${DELAY}s before retry..."
144-
sleep $DELAY
145-
fi
146-
done
147-
148-
if [ "$SUCCESS" = false ]; then
149-
echo "========================================="
150-
echo "✗✗✗ FATAL: All $MAX_RETRIES connection attempts failed ✗✗✗"
151-
echo "This likely indicates:"
152-
echo " - ARM node is not accessible"
153-
echo " - Network connectivity issues"
154-
echo " - SSH service not running"
155-
echo "========================================="
156-
exit 1
157-
fi
158-
159-
# Final stabilization period
160-
echo "========================================="
161-
echo "⏳ Allowing connection to stabilize (10s)..."
162-
sleep 10
163-
164-
# Final verification
165-
echo "→ Final connectivity verification..."
166-
if ssh $SSH_OPTS $SSH_HOST "echo '✓ Final check: SSH OK'"; then
167-
echo "✓✓✓ SSH connectivity fully stable and ready ✓✓✓"
168-
echo "========================================="
169-
else
170-
echo "✗✗✗ FATAL: Final verification failed ✗✗✗"
171-
exit 1
172-
fi
173-
env:
174-
SSH_IP: ${{ fromJson(steps.scw.outputs.json).public_ip.address }}
17595
- name: Install Docker
176-
run: ssh root@$SSH_IP "curl -fsSL https://test.docker.com -o test-docker.sh ; chmod +x test-docker.sh ; sh test-docker.sh ; echo 'ClientAliveInterval 60' >> /etc/ssh/sshd_config ; echo 'ClientAliveCountMax 0' >> /etc/ssh/sshd_config ; systemctl restart ssh"
96+
run: ssh root@$SSH_IP "curl -fsSL https://test.docker.com -o test-docker.sh ; chmod +x test-docker.sh ; sh test-docker.sh ; echo 'ClientAliveInterval 15' >> /etc/ssh/sshd_config ; echo 'ClientAliveCountMax 60' >> /etc/ssh/sshd_config ; echo 'MaxSessions 50' >> /etc/ssh/sshd_config ; echo 'MaxStartups 50:30:100' >> /etc/ssh/sshd_config ; systemctl restart ssh"
17797
env:
17898
SSH_IP: ${{ fromJson(steps.scw.outputs.json).public_ip.address }}

.github/workflows/linux-build.yml

Lines changed: 9 additions & 94 deletions
Original file line numberDiff line numberDiff line change
@@ -68,107 +68,22 @@ jobs:
6868
echo "$SSH_KEY" > ~/.ssh/id_rsa_arm
6969
chmod 600 ~/.ssh/id_rsa_arm
7070
echo "$SSH_CONFIG" | sed "s/SSH_IP/$SSH_IP/g" > ~/.ssh/config
71-
# Enhanced keepalive and timeout settings
71+
# Enhanced keepalive and timeout settings for long-running builds
7272
cat >> ~/.ssh/config << 'EOF'
73-
ServerAliveInterval 30
74-
ServerAliveCountMax 3
73+
ServerAliveInterval 15
74+
ServerAliveCountMax 20
7575
TCPKeepAlive yes
76-
ConnectTimeout 10
77-
ConnectionAttempts 1
76+
ConnectTimeout 30
77+
ConnectionAttempts 3
78+
ControlMaster auto
79+
ControlPath ~/.ssh/control-%C
80+
ControlPersist 1h
81+
StrictHostKeyChecking no
7882
EOF
7983
env:
8084
SSH_KEY: ${{ secrets.ARM_SSH_KEY }}
8185
SSH_IP: ${{ secrets.ARM_SSH_IP }}
8286
SSH_CONFIG: ${{ secrets.ARM_SSH_CONFIG }}
83-
- name: Check ARM SSH connectivity
84-
if: startsWith(env.ARCH, 'arm') == true
85-
run: |
86-
echo "=== Starting enhanced SSH connectivity check ==="
87-
SSH_HOST="root@$SSH_IP"
88-
MAX_RETRIES=10
89-
RETRY_COUNT=0
90-
SUCCESS=false
91-
92-
# Enhanced SSH options for stability
93-
SSH_OPTS="-o StrictHostKeyChecking=no \
94-
-o ConnectTimeout=10 \
95-
-o ConnectionAttempts=1 \
96-
-o ServerAliveInterval=30 \
97-
-o ServerAliveCountMax=3 \
98-
-o TCPKeepAlive=yes \
99-
-o BatchMode=yes \
100-
-i ~/.ssh/id_rsa_arm"
101-
102-
while [ $RETRY_COUNT -lt $MAX_RETRIES ]; do
103-
RETRY_COUNT=$((RETRY_COUNT + 1))
104-
echo "----------------------------------------"
105-
echo "Attempt $RETRY_COUNT of $MAX_RETRIES"
106-
echo "Timestamp: $(date '+%Y-%m-%d %H:%M:%S')"
107-
108-
# First check if SSH port is reachable
109-
echo "→ Checking TCP port 22..."
110-
if timeout 5 bash -c "cat < /dev/null > /dev/tcp/$SSH_IP/22" 2>/dev/null; then
111-
echo "✓ Port 22 is open"
112-
113-
# Try SSH connection with comprehensive checks
114-
echo "→ Testing SSH connection..."
115-
if ssh $SSH_OPTS $SSH_HOST "echo '✓ SSH connection established' && \
116-
echo '→ Checking Docker...' && \
117-
docker info > /dev/null 2>&1 && \
118-
echo '✓ Docker is running' && \
119-
echo '→ Checking network...' && \
120-
ping -c 1 9.9.9.9 > /dev/null 2>&1 && \
121-
echo '✓ Network connectivity verified (Quad9)' && \
122-
echo '→ System info:' && \
123-
uptime && \
124-
free -h | head -n 2" 2>&1; then
125-
echo "✓✓✓ All connectivity checks passed! ✓✓✓"
126-
SUCCESS=true
127-
break
128-
else
129-
echo "✗ SSH command execution failed"
130-
fi
131-
else
132-
echo "✗ Port 22 is not reachable"
133-
fi
134-
135-
# Calculate backoff delay (exponential: 5, 10, 15, 20, 30, 30, ...)
136-
if [ $RETRY_COUNT -lt $MAX_RETRIES ]; then
137-
DELAY=$((5 * RETRY_COUNT))
138-
[ $DELAY -gt 30 ] && DELAY=30
139-
echo "⏳ Waiting ${DELAY}s before retry..."
140-
sleep $DELAY
141-
fi
142-
done
143-
144-
if [ "$SUCCESS" = false ]; then
145-
echo "========================================="
146-
echo "✗✗✗ FATAL: All $MAX_RETRIES connection attempts failed ✗✗✗"
147-
echo "This likely indicates:"
148-
echo " - ARM node is not accessible"
149-
echo " - Network connectivity issues"
150-
echo " - SSH service not running"
151-
echo " - Docker service not running"
152-
echo "========================================="
153-
exit 1
154-
fi
155-
156-
# Final stabilization period
157-
echo "========================================="
158-
echo "⏳ Allowing connection to stabilize (10s)..."
159-
sleep 10
160-
161-
# Final verification
162-
echo "→ Final connectivity verification..."
163-
if ssh $SSH_OPTS $SSH_HOST "echo '✓ Final check: SSH OK' && docker ps > /dev/null 2>&1 && echo '✓ Final check: Docker OK'"; then
164-
echo "✓✓✓ SSH connectivity fully stable and ready ✓✓✓"
165-
echo "========================================="
166-
else
167-
echo "✗✗✗ FATAL: Final verification failed ✗✗✗"
168-
exit 1
169-
fi
170-
env:
171-
SSH_IP: ${{ secrets.ARM_SSH_IP }}
17287
- name: Setup Buildx
17388
uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # v3.11.1
17489
if: startsWith(env.ARCH, 'arm') == false

0 commit comments

Comments
(0)