Skip to content

Commit 2743e5f

Browse files
Merge pull request #2847 from bunkerity/dev
feat: Enhance SSH connectivity checks with improved timeout and keepalive settings
2 parents 48586a2 + ac0fa0d commit 2743e5f

4 files changed

Lines changed: 360 additions & 50 deletions

File tree

.github/workflows/container-build.yml

Lines changed: 90 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -59,28 +59,105 @@ jobs:
5959
echo "$SSH_KEY" > ~/.ssh/id_rsa_arm
6060
chmod 600 ~/.ssh/id_rsa_arm
6161
echo "$SSH_CONFIG" | sed "s/SSH_IP/$SSH_IP/g" > ~/.ssh/config
62-
echo "ServerAliveInterval 60" >> ~/.ssh/config
63-
echo "ServerAliveCountMax 10" >> ~/.ssh/config
62+
# Enhanced keepalive and timeout settings
63+
cat >> ~/.ssh/config << 'EOF'
64+
ServerAliveInterval 30
65+
ServerAliveCountMax 3
66+
TCPKeepAlive yes
67+
ConnectTimeout 10
68+
ConnectionAttempts 1
69+
EOF
6470
env:
6571
SSH_KEY: ${{ secrets.ARM_SSH_KEY }}
6672
SSH_IP: ${{ secrets.ARM_SSH_IP }}
6773
SSH_CONFIG: ${{ secrets.ARM_SSH_CONFIG }}
6874
- name: Check ARM SSH connectivity
6975
if: startsWith(inputs.CACHE_SUFFIX, 'arm')
7076
run: |
71-
echo "Testing SSH connectivity to ARM node..."
72-
for i in {1..5}; do
73-
if ssh -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa_arm root@$SSH_IP "echo 'Connection test $i successful' && docker info > /dev/null 2>&1 && echo 'Docker is running'"; then
74-
echo "✓ SSH connection and Docker verified"
75-
break
77+
echo "=== Starting enhanced SSH connectivity check ==="
78+
SSH_HOST="root@$SSH_IP"
79+
MAX_RETRIES=10
80+
RETRY_COUNT=0
81+
SUCCESS=false
82+
83+
# Enhanced SSH options for stability
84+
SSH_OPTS="-o StrictHostKeyChecking=no \
85+
-o ConnectTimeout=10 \
86+
-o ConnectionAttempts=1 \
87+
-o ServerAliveInterval=30 \
88+
-o ServerAliveCountMax=3 \
89+
-o TCPKeepAlive=yes \
90+
-o BatchMode=yes \
91+
-i ~/.ssh/id_rsa_arm"
92+
93+
while [ $RETRY_COUNT -lt $MAX_RETRIES ]; do
94+
RETRY_COUNT=$((RETRY_COUNT + 1))
95+
echo "----------------------------------------"
96+
echo "Attempt $RETRY_COUNT of $MAX_RETRIES"
97+
echo "Timestamp: $(date '+%Y-%m-%d %H:%M:%S')"
98+
99+
# First check if SSH port is reachable
100+
echo "→ Checking TCP port 22..."
101+
if timeout 5 bash -c "cat < /dev/null > /dev/tcp/$SSH_IP/22" 2>/dev/null; then
102+
echo "✓ Port 22 is open"
103+
104+
# Try SSH connection with comprehensive checks
105+
echo "→ Testing SSH connection..."
106+
if ssh $SSH_OPTS $SSH_HOST "echo '✓ SSH connection established' && \
107+
echo '→ Checking Docker...' && \
108+
docker info > /dev/null 2>&1 && \
109+
echo '✓ Docker is running' && \
110+
echo '→ Checking network...' && \
111+
ping -c 1 9.9.9.9 > /dev/null 2>&1 && \
112+
echo '✓ Network connectivity verified (Quad9)' && \
113+
echo '→ System info:' && \
114+
uptime && \
115+
free -h | head -n 2" 2>&1; then
116+
echo "✓✓✓ All connectivity checks passed! ✓✓✓"
117+
SUCCESS=true
118+
break
119+
else
120+
echo "✗ SSH command execution failed"
121+
fi
122+
else
123+
echo "✗ Port 22 is not reachable"
124+
fi
125+
126+
# Calculate backoff delay (linear, capped at 30s: 5, 10, 15, 20, 25, 30, 30, ...)
127+
if [ $RETRY_COUNT -lt $MAX_RETRIES ]; then
128+
DELAY=$((5 * RETRY_COUNT))
129+
[ $DELAY -gt 30 ] && DELAY=30
130+
echo "⏳ Waiting ${DELAY}s before retry..."
131+
sleep $DELAY
76132
fi
77-
echo "Retry $i/5 failed, waiting 10 seconds..."
78-
sleep 10
79133
done
80-
echo "Waiting for connection to stabilize..."
81-
sleep 5
82-
echo "Final connectivity check..."
83-
ssh -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa_arm root@$SSH_IP "echo 'Final check OK' && uptime"
134+
135+
if [ "$SUCCESS" = false ]; then
136+
echo "========================================="
137+
echo "✗✗✗ FATAL: All $MAX_RETRIES connection attempts failed ✗✗✗"
138+
echo "This likely indicates:"
139+
echo " - ARM node is not accessible"
140+
echo " - Network connectivity issues"
141+
echo " - SSH service not running"
142+
echo " - Docker service not running"
143+
echo "========================================="
144+
exit 1
145+
fi
146+
147+
# Final stabilization period
148+
echo "========================================="
149+
echo "⏳ Allowing connection to stabilize (10s)..."
150+
sleep 10
151+
152+
# Final verification
153+
echo "→ Final connectivity verification..."
154+
if ssh $SSH_OPTS $SSH_HOST "echo '✓ Final check: SSH OK' && docker ps > /dev/null 2>&1 && echo '✓ Final check: Docker OK'"; then
155+
echo "✓✓✓ SSH connectivity fully stable and ready ✓✓✓"
156+
echo "========================================="
157+
else
158+
echo "✗✗✗ FATAL: Final verification failed ✗✗✗"
159+
exit 1
160+
fi
84161
env:
85162
SSH_IP: ${{ secrets.ARM_SSH_IP }}
86163
- name: Setup Buildx

.github/workflows/create-arm.yml

Lines changed: 90 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -76,25 +76,104 @@ jobs:
7676
echo "$SSH_KEY" > ~/.ssh/id_rsa_arm
7777
chmod 600 ~/.ssh/id_rsa_arm
7878
echo "$SSH_CONFIG" | sed "s/SSH_IP/$SSH_IP/g" > ~/.ssh/config
79+
# Enhanced keepalive and timeout settings
80+
cat >> ~/.ssh/config << 'EOF'
81+
ServerAliveInterval 30
82+
ServerAliveCountMax 3
83+
TCPKeepAlive yes
84+
ConnectTimeout 10
85+
ConnectionAttempts 1
86+
EOF
7987
env:
8088
SSH_KEY: ${{ secrets.ARM_SSH_KEY }}
8189
SSH_IP: ${{ fromJson(steps.scw.outputs.json).public_ip.address }}
8290
SSH_CONFIG: ${{ secrets.ARM_SSH_CONFIG }}
8391
- name: Check ARM SSH connectivity
8492
run: |
85-
echo "Testing SSH connectivity to ARM node..."
86-
for i in {1..5}; do
87-
if ssh -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa_arm root@$SSH_IP "echo 'Connection test $i successful' && docker info > /dev/null 2>&1 && echo 'Docker is running'"; then
88-
echo "✓ SSH connection and Docker verified"
89-
break
93+
echo "=== Starting enhanced SSH connectivity check ==="
94+
SSH_HOST="root@$SSH_IP"
95+
MAX_RETRIES=10
96+
RETRY_COUNT=0
97+
SUCCESS=false
98+
99+
# Enhanced SSH options for stability
100+
SSH_OPTS="-o StrictHostKeyChecking=no \
101+
-o ConnectTimeout=10 \
102+
-o ConnectionAttempts=1 \
103+
-o ServerAliveInterval=30 \
104+
-o ServerAliveCountMax=3 \
105+
-o TCPKeepAlive=yes \
106+
-o BatchMode=yes \
107+
-i ~/.ssh/id_rsa_arm"
108+
109+
while [ $RETRY_COUNT -lt $MAX_RETRIES ]; do
110+
RETRY_COUNT=$((RETRY_COUNT + 1))
111+
echo "----------------------------------------"
112+
echo "Attempt $RETRY_COUNT of $MAX_RETRIES"
113+
echo "Timestamp: $(date '+%Y-%m-%d %H:%M:%S')"
114+
115+
# First check if SSH port is reachable
116+
echo "→ Checking TCP port 22..."
117+
if timeout 5 bash -c "cat < /dev/null > /dev/tcp/$SSH_IP/22" 2>/dev/null; then
118+
echo "✓ Port 22 is open"
119+
120+
# Try SSH connection with comprehensive checks
121+
echo "→ Testing SSH connection..."
122+
if ssh $SSH_OPTS $SSH_HOST "echo '✓ SSH connection established' && \
123+
echo '→ Checking Docker...' && \
124+
docker info > /dev/null 2>&1 && \
125+
echo '✓ Docker is running' && \
126+
echo '→ Checking network...' && \
127+
ping -c 1 9.9.9.9 > /dev/null 2>&1 && \
128+
echo '✓ Network connectivity verified (Quad9)' && \
129+
echo '→ System info:' && \
130+
uptime && \
131+
free -h | head -n 2" 2>&1; then
132+
echo "✓✓✓ All connectivity checks passed! ✓✓✓"
133+
SUCCESS=true
134+
break
135+
else
136+
echo "✗ SSH command execution failed"
137+
fi
138+
else
139+
echo "✗ Port 22 is not reachable"
140+
fi
141+
142+
# Calculate backoff delay (linear, capped at 30s: 5, 10, 15, 20, 25, 30, 30, ...)
143+
if [ $RETRY_COUNT -lt $MAX_RETRIES ]; then
144+
DELAY=$((5 * RETRY_COUNT))
145+
[ $DELAY -gt 30 ] && DELAY=30
146+
echo "⏳ Waiting ${DELAY}s before retry..."
147+
sleep $DELAY
90148
fi
91-
echo "Retry $i/5 failed, waiting 10 seconds..."
92-
sleep 10
93149
done
94-
echo "Waiting for connection to stabilize..."
95-
sleep 5
96-
echo "Final connectivity check..."
97-
ssh -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa_arm root@$SSH_IP "echo 'Final check OK' && uptime"
150+
151+
if [ "$SUCCESS" = false ]; then
152+
echo "========================================="
153+
echo "✗✗✗ FATAL: All $MAX_RETRIES connection attempts failed ✗✗✗"
154+
echo "This likely indicates:"
155+
echo " - ARM node is not accessible"
156+
echo " - Network connectivity issues"
157+
echo " - SSH service not running"
158+
echo " - Docker service not running"
159+
echo "========================================="
160+
exit 1
161+
fi
162+
163+
# Final stabilization period
164+
echo "========================================="
165+
echo "⏳ Allowing connection to stabilize (10s)..."
166+
sleep 10
167+
168+
# Final verification
169+
echo "→ Final connectivity verification..."
170+
if ssh $SSH_OPTS $SSH_HOST "echo '✓ Final check: SSH OK' && docker ps > /dev/null 2>&1 && echo '✓ Final check: Docker OK'"; then
171+
echo "✓✓✓ SSH connectivity fully stable and ready ✓✓✓"
172+
echo "========================================="
173+
else
174+
echo "✗✗✗ FATAL: Final verification failed ✗✗✗"
175+
exit 1
176+
fi
98177
env:
99178
SSH_IP: ${{ fromJson(steps.scw.outputs.json).public_ip.address }}
100179
- name: Install Docker

.github/workflows/linux-build.yml

Lines changed: 90 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -68,28 +68,105 @@ jobs:
6868
echo "$SSH_KEY" > ~/.ssh/id_rsa_arm
6969
chmod 600 ~/.ssh/id_rsa_arm
7070
echo "$SSH_CONFIG" | sed "s/SSH_IP/$SSH_IP/g" > ~/.ssh/config
71-
echo "ServerAliveInterval 60" >> ~/.ssh/config
72-
echo "ServerAliveCountMax 10" >> ~/.ssh/config
71+
# Enhanced keepalive and timeout settings
72+
cat >> ~/.ssh/config << 'EOF'
73+
ServerAliveInterval 30
74+
ServerAliveCountMax 3
75+
TCPKeepAlive yes
76+
ConnectTimeout 10
77+
ConnectionAttempts 1
78+
EOF
7379
env:
7480
SSH_KEY: ${{ secrets.ARM_SSH_KEY }}
7581
SSH_IP: ${{ secrets.ARM_SSH_IP }}
7682
SSH_CONFIG: ${{ secrets.ARM_SSH_CONFIG }}
7783
- name: Check ARM SSH connectivity
7884
if: startsWith(env.ARCH, 'arm') == true
7985
run: |
80-
echo "Testing SSH connectivity to ARM node..."
81-
for i in {1..5}; do
82-
if ssh -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa_arm root@$SSH_IP "echo 'Connection test $i successful' && docker info > /dev/null 2>&1 && echo 'Docker is running'"; then
83-
echo "✓ SSH connection and Docker verified"
84-
break
86+
echo "=== Starting enhanced SSH connectivity check ==="
87+
SSH_HOST="root@$SSH_IP"
88+
MAX_RETRIES=10
89+
RETRY_COUNT=0
90+
SUCCESS=false
91+
92+
# Enhanced SSH options for stability
93+
SSH_OPTS="-o StrictHostKeyChecking=no \
94+
-o ConnectTimeout=10 \
95+
-o ConnectionAttempts=1 \
96+
-o ServerAliveInterval=30 \
97+
-o ServerAliveCountMax=3 \
98+
-o TCPKeepAlive=yes \
99+
-o BatchMode=yes \
100+
-i ~/.ssh/id_rsa_arm"
101+
102+
while [ $RETRY_COUNT -lt $MAX_RETRIES ]; do
103+
RETRY_COUNT=$((RETRY_COUNT + 1))
104+
echo "----------------------------------------"
105+
echo "Attempt $RETRY_COUNT of $MAX_RETRIES"
106+
echo "Timestamp: $(date '+%Y-%m-%d %H:%M:%S')"
107+
108+
# First check if SSH port is reachable
109+
echo "→ Checking TCP port 22..."
110+
if timeout 5 bash -c "cat < /dev/null > /dev/tcp/$SSH_IP/22" 2>/dev/null; then
111+
echo "✓ Port 22 is open"
112+
113+
# Try SSH connection with comprehensive checks
114+
echo "→ Testing SSH connection..."
115+
if ssh $SSH_OPTS $SSH_HOST "echo '✓ SSH connection established' && \
116+
echo '→ Checking Docker...' && \
117+
docker info > /dev/null 2>&1 && \
118+
echo '✓ Docker is running' && \
119+
echo '→ Checking network...' && \
120+
ping -c 1 9.9.9.9 > /dev/null 2>&1 && \
121+
echo '✓ Network connectivity verified (Quad9)' && \
122+
echo '→ System info:' && \
123+
uptime && \
124+
free -h | head -n 2" 2>&1; then
125+
echo "✓✓✓ All connectivity checks passed! ✓✓✓"
126+
SUCCESS=true
127+
break
128+
else
129+
echo "✗ SSH command execution failed"
130+
fi
131+
else
132+
echo "✗ Port 22 is not reachable"
133+
fi
134+
135+
# Calculate backoff delay (linear, capped at 30s: 5, 10, 15, 20, 25, 30, 30, ...)
136+
if [ $RETRY_COUNT -lt $MAX_RETRIES ]; then
137+
DELAY=$((5 * RETRY_COUNT))
138+
[ $DELAY -gt 30 ] && DELAY=30
139+
echo "⏳ Waiting ${DELAY}s before retry..."
140+
sleep $DELAY
85141
fi
86-
echo "Retry $i/5 failed, waiting 10 seconds..."
87-
sleep 10
88142
done
89-
echo "Waiting for connection to stabilize..."
90-
sleep 5
91-
echo "Final connectivity check..."
92-
ssh -o StrictHostKeyChecking=no -i ~/.ssh/id_rsa_arm root@$SSH_IP "echo 'Final check OK' && uptime"
143+
144+
if [ "$SUCCESS" = false ]; then
145+
echo "========================================="
146+
echo "✗✗✗ FATAL: All $MAX_RETRIES connection attempts failed ✗✗✗"
147+
echo "This likely indicates:"
148+
echo " - ARM node is not accessible"
149+
echo " - Network connectivity issues"
150+
echo " - SSH service not running"
151+
echo " - Docker service not running"
152+
echo "========================================="
153+
exit 1
154+
fi
155+
156+
# Final stabilization period
157+
echo "========================================="
158+
echo "⏳ Allowing connection to stabilize (10s)..."
159+
sleep 10
160+
161+
# Final verification
162+
echo "→ Final connectivity verification..."
163+
if ssh $SSH_OPTS $SSH_HOST "echo '✓ Final check: SSH OK' && docker ps > /dev/null 2>&1 && echo '✓ Final check: Docker OK'"; then
164+
echo "✓✓✓ SSH connectivity fully stable and ready ✓✓✓"
165+
echo "========================================="
166+
else
167+
echo "✗✗✗ FATAL: Final verification failed ✗✗✗"
168+
exit 1
169+
fi
93170
env:
94171
SSH_IP: ${{ secrets.ARM_SSH_IP }}
95172
- name: Setup Buildx

0 commit comments

Comments
(0)