Skip to content

Commit 004adb5

Browse files
committed
[iris] Install cloudflared in controller Docker image, run via docker exec
Move cloudflared from host VM bootstrap to the controller Dockerfile. Launch it inside the running container via `docker exec -d` over SSH, instead of as a local subprocess on the client machine.

- Dockerfile: install cloudflared binary in controller stage
- tunnel.py: launch/stop via docker exec on the controller VM
- bootstrap.py: remove host-level cloudflared installation
- vm_lifecycle.py: pass VM handle to tunnel start/stop

https://claude.ai/code/session_01RyutK1NjJZXmmKdeWbGghb
1 parent d567ff5 commit 004adb5

5 files changed

Lines changed: 113 additions & 81 deletions

File tree

lib/iris/Dockerfile

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,11 @@ FROM deps AS controller
7777

7878
LABEL org.opencontainers.image.description="Iris controller image"
7979

80+
# cloudflared for Cloudflare Tunnel (public dashboard access without SSH)
81+
RUN curl -fsSL https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64 \
82+
-o /usr/local/bin/cloudflared \
83+
&& chmod +x /usr/local/bin/cloudflared
84+
8085
HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \
8186
CMD curl -f http://localhost:10000/health || exit 1
8287

lib/iris/src/iris/cluster/controller/vm_lifecycle.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -380,7 +380,7 @@ def start_controller(
380380
if wait_healthy(existing_vm, port, timeout=health_check_timeout):
381381
address = f"http://{existing_vm.internal_address}:{port}"
382382
logger.info("Existing controller at %s is healthy", address)
383-
tunnel_handle = _maybe_start_tunnel(config, port)
383+
tunnel_handle = _maybe_start_tunnel(config, port, existing_vm)
384384
return address, existing_vm, tunnel_handle
385385
logger.info("Existing controller is unhealthy, terminating and recreating")
386386
existing_vm.terminate()
@@ -410,7 +410,7 @@ def start_controller(
410410
vm.set_metadata({labels.iris_controller_address: address})
411411

412412
# Start tunnel if configured
413-
tunnel_handle = _maybe_start_tunnel(config, port)
413+
tunnel_handle = _maybe_start_tunnel(config, port, vm)
414414

415415
logger.info("Controller started at %s", address)
416416
if tunnel_handle:
@@ -421,13 +421,14 @@ def start_controller(
421421
def _maybe_start_tunnel(
422422
config: config_pb2.IrisClusterConfig,
423423
port: int,
424+
vm: StandaloneWorkerHandle,
424425
) -> TunnelHandle | None:
425426
"""Start a Cloudflare Tunnel if configured. Returns None if disabled."""
426427
tunnel_config = _build_tunnel_config(config)
427428
if tunnel_config is None:
428429
return None
429430
try:
430-
return start_tunnel(tunnel_config, port)
431+
return start_tunnel(tunnel_config, port, vm)
431432
except Exception:
432433
logger.warning("Failed to start Cloudflare Tunnel — controller is still accessible via SSH", exc_info=True)
433434
return None
@@ -462,7 +463,7 @@ def restart_controller(
462463
if not wait_healthy(vm, port, timeout=health_check_timeout):
463464
raise RuntimeError(f"Controller at {address} failed health check after restart")
464465

465-
tunnel_handle = _maybe_start_tunnel(config, port)
466+
tunnel_handle = _maybe_start_tunnel(config, port, vm)
466467

467468
logger.info("Controller container restarted at %s", address)
468469
if tunnel_handle:

lib/iris/src/iris/cluster/providers/gcp/bootstrap.py

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -283,16 +283,6 @@ def build_worker_bootstrap_script(
283283
echo "[iris-controller] [1/5] Docker already installed: $(docker --version)"
284284
fi
285285
286-
# Install cloudflared for Cloudflare Tunnel support (idempotent)
287-
if ! command -v cloudflared &> /dev/null; then
288-
echo "[iris-controller] Installing cloudflared..."
289-
curl -fsSL https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64 \
290-
-o /usr/local/bin/cloudflared && chmod +x /usr/local/bin/cloudflared
291-
echo "[iris-controller] cloudflared installed: $(cloudflared --version)"
292-
else
293-
echo "[iris-controller] cloudflared already installed: $(cloudflared --version)"
294-
fi
295-
296286
echo "[iris-controller] [2/5] Ensuring Docker daemon is running..."
297287
sudo systemctl start docker || true
298288
if sudo docker info > /dev/null 2>&1; then

lib/iris/src/iris/cluster/tunnel.py

Lines changed: 68 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -7,27 +7,36 @@
77
accessible at ``marin-<nonce>.iris-ops.dev`` without a public IP address or
88
SSH port-forwarding.
99
10+
Architecture:
11+
- Cloudflare API calls (create tunnel, DNS, get token) run **client-side**
12+
(on the machine running ``iris cluster start``).
13+
- ``cloudflared`` runs **inside the controller container** (installed in the
14+
Docker image). The container uses ``--network=host``, so
15+
``localhost:<port>`` reaches the controller. It is launched via
16+
``docker exec`` from the client over SSH.
17+
1018
Lifecycle:
11-
1. ``start_tunnel()`` — create tunnel, set DNS CNAME, launch ``cloudflared``.
12-
2. ``stop_tunnel()`` — kill ``cloudflared``, delete DNS record, delete tunnel.
19+
1. ``start_tunnel()`` — create tunnel, set DNS CNAME, launch cloudflared in container.
20+
2. ``stop_tunnel()`` — kill cloudflared in container, optionally delete DNS/tunnel.
1321
"""
1422

1523
from __future__ import annotations
1624

1725
import hashlib
1826
import logging
19-
import subprocess
20-
import time
2127
from dataclasses import dataclass
2228

2329
import httpx
2430

31+
from iris.cluster.providers.types import RemoteWorkerHandle
32+
from iris.time_utils import Duration
33+
2534
logger = logging.getLogger(__name__)
2635

2736
CLOUDFLARE_API_BASE = "https://api.cloudflare.com/client/v4"
2837

29-
# How long to wait for cloudflared to establish the tunnel connection.
30-
TUNNEL_HEALTH_TIMEOUT_SECONDS = 30
38+
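# Name for a dedicated cloudflared container on the controller VM.
# NOTE(review): cloudflared is launched inside the controller container via
# ``docker exec``, so this name appears unused in this change — confirm or remove.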
39+
CLOUDFLARED_CONTAINER_NAME = "iris-cloudflared"
3140

3241

3342
@dataclass
@@ -66,7 +75,6 @@ class TunnelHandle:
6675
tunnel_token: str
6776
dns_record_id: str
6877
public_url: str
69-
process: subprocess.Popen | None = None
7078

7179

7280
def _cf_headers(api_token: str) -> dict[str, str]:
@@ -120,7 +128,6 @@ def _create_tunnel(
120128
api_token: str,
121129
) -> dict:
122130
"""Create a new Cloudflare Tunnel. Returns the tunnel object."""
123-
# Generate a tunnel secret (32 bytes, base64-encoded by Cloudflare)
124131
import secrets
125132

126133
tunnel_secret = secrets.token_urlsafe(32)
@@ -273,34 +280,54 @@ def _delete_tunnel(
273280
logger.info("Deleted tunnel %s", tunnel_id)
274281

275282

276-
def _launch_cloudflared(tunnel_token: str) -> subprocess.Popen:
277-
"""Launch ``cloudflared`` as a background subprocess."""
278-
cmd = [
279-
"cloudflared",
280-
"tunnel",
281-
"--no-autoupdate",
282-
"run",
283-
"--token",
284-
tunnel_token,
285-
]
286-
logger.info("Starting cloudflared tunnel connector")
287-
proc = subprocess.Popen(
288-
cmd,
289-
stdout=subprocess.PIPE,
290-
stderr=subprocess.PIPE,
283+
CONTROLLER_CONTAINER_NAME = "iris-controller"
284+
285+
286+
def _launch_cloudflared_on_vm(vm: RemoteWorkerHandle, tunnel_token: str) -> None:
287+
"""Launch ``cloudflared`` inside the controller container via ``docker exec``.
288+
289+
cloudflared is installed in the controller Docker image. The container
290+
uses ``--network=host``, so it can reach the controller at localhost.
291+
"""
292+
# Kill any existing cloudflared inside the container
293+
vm.run_command(
294+
f"sudo docker exec {CONTROLLER_CONTAINER_NAME} pkill -f 'cloudflared tunnel' || true",
295+
timeout=Duration.from_seconds(10),
296+
)
297+
298+
# Launch cloudflared in the background inside the container.
299+
cmd = (
300+
f"sudo docker exec -d {CONTROLLER_CONTAINER_NAME} "
301+
f"cloudflared tunnel --no-autoupdate run --token {tunnel_token}"
302+
)
303+
result = vm.run_command(cmd, timeout=Duration.from_seconds(15))
304+
if result.returncode != 0:
305+
raise RuntimeError(f"Failed to start cloudflared in container: {result.stderr}")
306+
307+
# Verify it's running
308+
check = vm.run_command(
309+
f"sudo docker exec {CONTROLLER_CONTAINER_NAME} pgrep -f 'cloudflared tunnel' || true",
310+
timeout=Duration.from_seconds(5),
311+
)
312+
if not check.stdout.strip():
313+
raise RuntimeError("cloudflared failed to start inside controller container")
314+
315+
logger.info("cloudflared started inside controller container")
316+
317+
318+
def _stop_cloudflared_on_vm(vm: RemoteWorkerHandle) -> None:
319+
"""Stop cloudflared inside the controller container."""
320+
vm.run_command(
321+
f"sudo docker exec {CONTROLLER_CONTAINER_NAME} pkill -f 'cloudflared tunnel' || true",
322+
timeout=Duration.from_seconds(10),
291323
)
292-
# Give cloudflared a moment to start and fail fast if binary is missing
293-
time.sleep(2)
294-
if proc.poll() is not None:
295-
stderr = proc.stderr.read().decode() if proc.stderr else ""
296-
raise RuntimeError(f"cloudflared exited immediately (rc={proc.returncode}): {stderr}")
297-
logger.info("cloudflared started (pid=%d)", proc.pid)
298-
return proc
324+
logger.info("cloudflared stopped inside controller container")
299325

300326

301-
def start_tunnel(config: TunnelConfig, controller_port: int) -> TunnelHandle:
302-
"""Create a Cloudflare Tunnel and DNS record, then launch cloudflared.
327+
def start_tunnel(config: TunnelConfig, controller_port: int, vm: RemoteWorkerHandle) -> TunnelHandle:
328+
"""Create a Cloudflare Tunnel and DNS record, then launch cloudflared on the VM.
303329
330+
Cloudflare API calls run client-side; cloudflared runs on the controller VM.
304331
Idempotent: reuses an existing tunnel with the same name if present.
305332
"""
306333
if not config.api_token:
@@ -340,8 +367,8 @@ def start_tunnel(config: TunnelConfig, controller_port: int) -> TunnelHandle:
340367
# 4. Upsert DNS CNAME
341368
dns_record_id = _upsert_dns_record(client, config.cloudflare_zone_id, fqdn, tunnel_id, config.api_token)
342369

343-
# 5. Launch cloudflared
344-
proc = _launch_cloudflared(tunnel_token)
370+
# 5. Launch cloudflared on the controller VM
371+
_launch_cloudflared_on_vm(vm, tunnel_token)
345372

346373
public_url = config.public_url
347374
logger.info("Tunnel active: %s", public_url)
@@ -351,31 +378,27 @@ def start_tunnel(config: TunnelConfig, controller_port: int) -> TunnelHandle:
351378
tunnel_token=tunnel_token,
352379
dns_record_id=dns_record_id,
353380
public_url=public_url,
354-
process=proc,
355381
)
356382

357383

358384
def stop_tunnel(
359385
handle: TunnelHandle,
360386
config: TunnelConfig,
387+
vm: RemoteWorkerHandle | None = None,
361388
delete_tunnel: bool = False,
362389
) -> None:
363-
"""Stop cloudflared and optionally clean up DNS/tunnel.
390+
"""Stop cloudflared on the VM and optionally clean up DNS/tunnel.
364391
365392
By default, DNS records and the tunnel are preserved so the same URL keeps
366393
working across controller restarts. Pass ``delete_tunnel=True`` for full
367394
cleanup (e.g. on ``iris cluster stop``).
368395
"""
369-
# Kill cloudflared process
370-
if handle.process and handle.process.poll() is None:
371-
logger.info("Stopping cloudflared (pid=%d)", handle.process.pid)
372-
handle.process.terminate()
396+
# Kill cloudflared on the VM
397+
if vm is not None:
373398
try:
374-
handle.process.wait(timeout=10)
375-
except subprocess.TimeoutExpired:
376-
handle.process.kill()
377-
handle.process.wait()
378-
logger.info("cloudflared stopped")
399+
_stop_cloudflared_on_vm(vm)
400+
except Exception:
401+
logger.warning("Failed to stop cloudflared on VM", exc_info=True)
379402

380403
if not delete_tunnel:
381404
return

lib/iris/tests/cluster/test_tunnel.py

Lines changed: 35 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
import httpx
1111
import pytest
1212

13+
from iris.cluster.providers.types import CommandResult
1314
from iris.cluster.tunnel import (
1415
TunnelConfig,
1516
TunnelHandle,
@@ -21,6 +22,7 @@
2122
start_tunnel,
2223
stop_tunnel,
2324
)
25+
from iris.time_utils import Duration
2426

2527
TUNNEL_MODULE = "iris.cluster.tunnel"
2628

@@ -108,8 +110,6 @@ def test_find_dns_record_returns_match():
108110

109111
def test_upsert_dns_record_creates_new():
110112
client = MagicMock()
111-
# First call: no existing record
112-
# Second call: create succeeds
113113
client.request.side_effect = [
114114
_make_cf_response([]), # find_dns_record returns empty
115115
_make_cf_response({"id": "rec-new"}), # create returns new record
@@ -141,15 +141,34 @@ def test_configure_tunnel_ingress():
141141
assert ingress[1]["service"] == "http_status:404"
142142

143143

144+
# ---------------------------------------------------------------------------
145+
# Fake VM for testing remote cloudflared launch
146+
# ---------------------------------------------------------------------------
147+
148+
149+
class FakeRemoteVM:
150+
"""Minimal fake implementing run_command for tunnel tests."""
151+
152+
def __init__(self, healthy: bool = True):
153+
self.commands: list[str] = []
154+
self._healthy = healthy
155+
156+
def run_command(self, command: str, timeout: Duration | None = None, on_line=None) -> CommandResult:
157+
self.commands.append(command)
158+
if "pgrep" in command and self._healthy:
159+
return CommandResult(returncode=0, stdout="12345", stderr="")
160+
return CommandResult(returncode=0, stdout="", stderr="")
161+
162+
144163
# ---------------------------------------------------------------------------
145164
# start_tunnel / stop_tunnel integration (mocked)
146165
# ---------------------------------------------------------------------------
147166

148167

149-
@patch(f"{TUNNEL_MODULE}._launch_cloudflared")
168+
@patch(f"{TUNNEL_MODULE}._launch_cloudflared_on_vm")
150169
@patch(f"{TUNNEL_MODULE}.httpx.Client")
151170
def test_start_tunnel_full_flow(mock_client_cls, mock_launch):
152-
"""start_tunnel creates tunnel, configures ingress, sets DNS, launches cloudflared."""
171+
"""start_tunnel creates tunnel, configures ingress, sets DNS, launches cloudflared on VM."""
153172
client = MagicMock()
154173
mock_client_cls.return_value.__enter__ = MagicMock(return_value=client)
155174
mock_client_cls.return_value.__exit__ = MagicMock(return_value=False)
@@ -163,10 +182,7 @@ def test_start_tunnel_full_flow(mock_client_cls, mock_launch):
163182
_make_cf_response({"id": "rec-xyz"}), # create dns record
164183
]
165184

166-
mock_proc = MagicMock()
167-
mock_proc.pid = 12345
168-
mock_launch.return_value = mock_proc
169-
185+
vm = FakeRemoteVM()
170186
config = TunnelConfig(
171187
enabled=True,
172188
domain="iris-ops.dev",
@@ -176,41 +192,39 @@ def test_start_tunnel_full_flow(mock_client_cls, mock_launch):
176192
cluster_name="test-cluster",
177193
)
178194

179-
handle = start_tunnel(config, controller_port=10000)
195+
handle = start_tunnel(config, controller_port=10000, vm=vm)
180196

181197
assert handle.tunnel_id == "tun-abc"
182198
assert handle.dns_record_id == "rec-xyz"
183199
assert handle.public_url == config.public_url
184-
assert handle.process is mock_proc
185-
mock_launch.assert_called_once()
200+
mock_launch.assert_called_once_with(vm, "tunnel-token-string")
186201

187202

188203
def test_start_tunnel_validates_required_fields():
204+
vm = FakeRemoteVM()
189205
config = TunnelConfig(enabled=True, api_token="", cloudflare_account_id="", cloudflare_zone_id="")
190206
with pytest.raises(ValueError, match="api_token"):
191-
start_tunnel(config, controller_port=10000)
207+
start_tunnel(config, controller_port=10000, vm=vm)
192208

193209
config2 = TunnelConfig(enabled=True, api_token="tok", cloudflare_account_id="", cloudflare_zone_id="")
194210
with pytest.raises(ValueError, match="account_id"):
195-
start_tunnel(config2, controller_port=10000)
211+
start_tunnel(config2, controller_port=10000, vm=vm)
196212

197213

198-
def test_stop_tunnel_terminates_process():
199-
proc = MagicMock()
200-
proc.poll.return_value = None # still running
214+
def test_stop_tunnel_kills_cloudflared_on_vm():
215+
vm = FakeRemoteVM()
201216
handle = TunnelHandle(
202217
tunnel_id="tun-1",
203218
tunnel_token="tok",
204219
dns_record_id="rec-1",
205220
public_url="https://test.iris-ops.dev",
206-
process=proc,
207221
)
208222
config = TunnelConfig(api_token="fake", cloudflare_account_id="acct", cloudflare_zone_id="zone")
209223

210-
stop_tunnel(handle, config, delete_tunnel=False)
224+
stop_tunnel(handle, config, vm=vm, delete_tunnel=False)
211225

212-
proc.terminate.assert_called_once()
213-
proc.wait.assert_called_once()
226+
# Should have issued a pkill command
227+
assert any("pkill" in cmd for cmd in vm.commands)
214228

215229

216230
@patch(f"{TUNNEL_MODULE}.httpx.Client")
@@ -230,10 +244,9 @@ def test_stop_tunnel_with_cleanup(mock_client_cls):
230244
tunnel_token="tok",
231245
dns_record_id="rec-1",
232246
public_url="https://test.iris-ops.dev",
233-
process=None,
234247
)
235248
config = TunnelConfig(api_token="fake", cloudflare_account_id="acct", cloudflare_zone_id="zone")
236249

237-
stop_tunnel(handle, config, delete_tunnel=True)
250+
stop_tunnel(handle, config, vm=None, delete_tunnel=True)
238251

239252
assert client.request.call_count == 2

0 commit comments

Comments
 (0)