Skip to content

Commit ae9049f

Browse files
fix: resolve Docker startup race causing "Gateway hello timed out"
The bridge and gateway were started in parallel via `concurrently`, but on cold Docker starts the gateway often took >30s to initialize (MCP, cron, channels), causing the bridge to time out with "Gateway hello timed out" (issue #104). Changes: - docker-entrypoint.sh: start gateway first, poll /health before launching bridge; delete stale server-token on startup - pilotdeck-bridge.js: make connect timeout configurable via PILOTDECK_BRIDGE_TIMEOUT env var (default 60s, was hardcoded 30s) - GatewayWsClient.ts: properly reject hello promise on WS close (e.g. auth_failed) instead of waiting for the full timeout; increase hello timeout from 5s to 10s Closes #104 Co-authored-by: Cursor <cursoragent@cursor.com>
1 parent 7ec1f25 commit ae9049f

3 files changed

Lines changed: 64 additions & 7 deletions

File tree

docker-entrypoint.sh

Lines changed: 39 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -150,9 +150,45 @@ echo "[pilotdeck-docker] Starting PilotDeck (gateway + UI server)..."
150150
echo "[pilotdeck-docker] Config: $CONFIG_FILE"
151151
echo "[pilotdeck-docker] UI will be available at http://0.0.0.0:${SERVER_PORT:-3001}"
152152

153+
# ── Remove stale auth token so the bridge never uses a leftover value ──
154+
rm -f "$PILOT_HOME/server-token"
155+
153156
# ── Start gateway, then UI server once the gateway is healthy ─────────
157+
# The bridge retries for PILOTDECK_BRIDGE_TIMEOUT ms (default 60000, i.e. 60s) which
158+
# may be too short on cold Docker starts. We first wait for the gateway
159+
# health endpoint before launching the bridge, eliminating the race.
154160
cd /app
155161

156-
exec npx concurrently --kill-others --names gateway,server \
157-
"node dist/src/cli/pilotdeck.js server" \
158-
"node --import tsx ui/server/index.js"
162+
GATEWAY_PORT="${PILOTDECK_GATEWAY_PORT:-18789}"
163+
GATEWAY_HEALTH_URL="http://127.0.0.1:${GATEWAY_PORT}/health"
164+
GATEWAY_READY_TIMEOUT="${PILOTDECK_GATEWAY_READY_TIMEOUT:-120}"
165+
166+
wait_for_gateway() {
167+
echo "[pilotdeck-docker] Waiting for gateway to become ready (timeout=${GATEWAY_READY_TIMEOUT}s)..."
168+
local elapsed=0
169+
while [ "$elapsed" -lt "$GATEWAY_READY_TIMEOUT" ]; do
170+
if curl -sf "$GATEWAY_HEALTH_URL" > /dev/null 2>&1; then
171+
echo "[pilotdeck-docker] Gateway is ready (took ${elapsed}s)."
172+
return 0
173+
fi
174+
sleep 1
175+
elapsed=$((elapsed + 1))
176+
done
177+
echo "[pilotdeck-docker] WARNING: Gateway did not become ready within ${GATEWAY_READY_TIMEOUT}s, starting bridge anyway." >&2
178+
return 0
179+
}
180+
181+
node dist/src/cli/pilotdeck.js server &
182+
GATEWAY_PID=$!
183+
184+
wait_for_gateway
185+
186+
node --import tsx ui/server/index.js &
187+
BRIDGE_PID=$!
188+
189+
# If either process exits, kill the other and propagate the exit code.
190+
wait -n $GATEWAY_PID $BRIDGE_PID 2>/dev/null
191+
EXIT_CODE=$?
192+
kill $GATEWAY_PID $BRIDGE_PID 2>/dev/null
193+
wait $GATEWAY_PID $BRIDGE_PID 2>/dev/null
194+
exit $EXIT_CODE

src/gateway/client/GatewayWsClient.ts

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -70,13 +70,33 @@ export class GatewayWsClient {
7070
);
7171

7272
return new Promise((resolve, reject) => {
73-
const timeout = setTimeout(() => reject(new Error("Gateway hello timed out.")), 5000);
73+
let settled = false;
74+
const timeout = setTimeout(() => {
75+
if (!settled) {
76+
settled = true;
77+
reject(new Error("Gateway hello timed out."));
78+
}
79+
}, 10_000);
80+
81+
const onClose = (event: CloseEvent) => {
82+
if (!settled) {
83+
settled = true;
84+
clearTimeout(timeout);
85+
const reason = event.reason || `code ${event.code}`;
86+
reject(new Error(`Gateway closed during hello: ${reason}`));
87+
}
88+
};
89+
ws.addEventListener("close", onClose, { once: true });
90+
7491
const onHello = () => {
92+
if (settled) return;
7593
if (this.hello) {
94+
settled = true;
7695
clearTimeout(timeout);
96+
ws.removeEventListener("close", onClose);
7797
resolve(this.hello);
7898
} else {
79-
setTimeout(onHello, 0);
99+
setTimeout(onHello, 50);
80100
}
81101
};
82102
onHello();

ui/server/pilotdeck-bridge.js

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -71,8 +71,9 @@ const GATEWAY_TOKEN_PATH =
7171
// parallel by `concurrently`. We allow up to 60 s by default (override with PILOTDECK_BRIDGE_TIMEOUT, in ms) for the gateway to
7272
// come up before failing the first call — covers cold MCP startup on
7373
// slower machines.
74-
const GATEWAY_CONNECT_TIMEOUT_MS = 30_000;
75-
const GATEWAY_CONNECT_RETRY_INTERVAL_MS = 250;
74+
const GATEWAY_CONNECT_TIMEOUT_MS =
75+
Number.parseInt(process.env.PILOTDECK_BRIDGE_TIMEOUT ?? '', 10) || 60_000;
76+
const GATEWAY_CONNECT_RETRY_INTERVAL_MS = 500;
7677
const subagentActivityStarts = new Map();
7778

7879
function normalizeToolDisplayName(name) {

0 commit comments

Comments
 (0)