77accessible at ``marin-<nonce>.iris-ops.dev`` without a public IP address or
88SSH port-forwarding.
99
10+ Architecture:
11+ - Cloudflare API calls (create tunnel, DNS, get token) run **client-side**
12+ (on the machine running ``iris cluster start``).
13+ - ``cloudflared`` runs **inside the controller container** (installed in the
14+ Docker image). The container uses ``--network=host``, so
15+ ``localhost:<port>`` reaches the controller. It is launched via
16+ ``docker exec`` from the client over SSH.
17+
1018Lifecycle:
11- 1. ``start_tunnel()`` — create tunnel, set DNS CNAME, launch `` cloudflared`` .
12- 2. ``stop_tunnel()`` — kill `` cloudflared``, delete DNS record, delete tunnel.
19+ 1. ``start_tunnel()`` — create tunnel, set DNS CNAME, launch cloudflared in the controller container.
20+ 2. ``stop_tunnel()`` — kill cloudflared in the controller container, optionally delete the DNS record and tunnel.
1321"""
1422
1523from __future__ import annotations
1624
1725import hashlib
1826import logging
19- import subprocess
20- import time
2127from dataclasses import dataclass
2228
2329import httpx
2430
31+ from iris .cluster .providers .types import RemoteWorkerHandle
32+ from iris .time_utils import Duration
33+
2534logger = logging .getLogger (__name__ )
2635
2736CLOUDFLARE_API_BASE = "https://api.cloudflare.com/client/v4"
2837
29- # How long to wait for cloudflared to establish the tunnel connection .
30- TUNNEL_HEALTH_TIMEOUT_SECONDS = 30
38+ # Name of the cloudflared container on the controller VM .
39+ CLOUDFLARED_CONTAINER_NAME = "iris-cloudflared"
3140
3241
3342@dataclass
@@ -66,7 +75,6 @@ class TunnelHandle:
6675 tunnel_token : str
6776 dns_record_id : str
6877 public_url : str
69- process : subprocess .Popen | None = None
7078
7179
7280def _cf_headers (api_token : str ) -> dict [str , str ]:
@@ -120,7 +128,6 @@ def _create_tunnel(
120128 api_token : str ,
121129) -> dict :
122130 """Create a new Cloudflare Tunnel. Returns the tunnel object."""
123- # Generate a tunnel secret (32 bytes, base64-encoded by Cloudflare)
124131 import secrets
125132
126133 tunnel_secret = secrets .token_urlsafe (32 )
@@ -273,34 +280,54 @@ def _delete_tunnel(
273280 logger .info ("Deleted tunnel %s" , tunnel_id )
274281
275282
276- def _launch_cloudflared (tunnel_token : str ) -> subprocess .Popen :
277- """Launch ``cloudflared`` as a background subprocess."""
278- cmd = [
279- "cloudflared" ,
280- "tunnel" ,
281- "--no-autoupdate" ,
282- "run" ,
283- "--token" ,
284- tunnel_token ,
285- ]
286- logger .info ("Starting cloudflared tunnel connector" )
287- proc = subprocess .Popen (
288- cmd ,
289- stdout = subprocess .PIPE ,
290- stderr = subprocess .PIPE ,
283+ CONTROLLER_CONTAINER_NAME = "iris-controller"
284+
285+
286+ def _launch_cloudflared_on_vm (vm : RemoteWorkerHandle , tunnel_token : str ) -> None :
287+ """Launch ``cloudflared`` inside the controller container via ``docker exec``.
288+
289+ cloudflared is installed in the controller Docker image. The container
290+ uses ``--network=host``, so it can reach the controller at localhost.
291+ """
292+ # Kill any existing cloudflared inside the container
293+ vm .run_command (
294+ f"sudo docker exec { CONTROLLER_CONTAINER_NAME } pkill -f 'cloudflared tunnel' || true" ,
295+ timeout = Duration .from_seconds (10 ),
296+ )
297+
298+ # Launch cloudflared in the background inside the container.
299+ cmd = (
300+ f"sudo docker exec -d { CONTROLLER_CONTAINER_NAME } "
301+ f"cloudflared tunnel --no-autoupdate run --token { tunnel_token } "
302+ )
303+ result = vm .run_command (cmd , timeout = Duration .from_seconds (15 ))
304+ if result .returncode != 0 :
305+ raise RuntimeError (f"Failed to start cloudflared in container: { result .stderr } " )
306+
307+ # Verify it's running
308+ check = vm .run_command (
309+ f"sudo docker exec { CONTROLLER_CONTAINER_NAME } pgrep -f 'cloudflared tunnel' || true" ,
310+ timeout = Duration .from_seconds (5 ),
311+ )
312+ if not check .stdout .strip ():
313+ raise RuntimeError ("cloudflared failed to start inside controller container" )
314+
315+ logger .info ("cloudflared started inside controller container" )
316+
317+
318+ def _stop_cloudflared_on_vm (vm : RemoteWorkerHandle ) -> None :
319+ """Stop cloudflared inside the controller container."""
320+ vm .run_command (
321+ f"sudo docker exec { CONTROLLER_CONTAINER_NAME } pkill -f 'cloudflared tunnel' || true" ,
322+ timeout = Duration .from_seconds (10 ),
291323 )
292- # Give cloudflared a moment to start and fail fast if binary is missing
293- time .sleep (2 )
294- if proc .poll () is not None :
295- stderr = proc .stderr .read ().decode () if proc .stderr else ""
296- raise RuntimeError (f"cloudflared exited immediately (rc={ proc .returncode } ): { stderr } " )
297- logger .info ("cloudflared started (pid=%d)" , proc .pid )
298- return proc
324+ logger .info ("cloudflared stopped inside controller container" )
299325
300326
301- def start_tunnel (config : TunnelConfig , controller_port : int ) -> TunnelHandle :
302- """Create a Cloudflare Tunnel and DNS record, then launch cloudflared.
327+ def start_tunnel (config : TunnelConfig , controller_port : int , vm : RemoteWorkerHandle ) -> TunnelHandle :
328+ """Create a Cloudflare Tunnel and DNS record, then launch cloudflared on the VM.
303329
330+ Cloudflare API calls run client-side; cloudflared runs on the controller VM.
304331 Idempotent: reuses an existing tunnel with the same name if present.
305332 """
306333 if not config .api_token :
@@ -340,8 +367,8 @@ def start_tunnel(config: TunnelConfig, controller_port: int) -> TunnelHandle:
340367 # 4. Upsert DNS CNAME
341368 dns_record_id = _upsert_dns_record (client , config .cloudflare_zone_id , fqdn , tunnel_id , config .api_token )
342369
343- # 5. Launch cloudflared
344- proc = _launch_cloudflared ( tunnel_token )
370+ # 5. Launch cloudflared on the controller VM
371+ _launch_cloudflared_on_vm ( vm , tunnel_token )
345372
346373 public_url = config .public_url
347374 logger .info ("Tunnel active: %s" , public_url )
@@ -351,31 +378,27 @@ def start_tunnel(config: TunnelConfig, controller_port: int) -> TunnelHandle:
351378 tunnel_token = tunnel_token ,
352379 dns_record_id = dns_record_id ,
353380 public_url = public_url ,
354- process = proc ,
355381 )
356382
357383
358384def stop_tunnel (
359385 handle : TunnelHandle ,
360386 config : TunnelConfig ,
387+ vm : RemoteWorkerHandle | None = None ,
361388 delete_tunnel : bool = False ,
362389) -> None :
363- """Stop cloudflared and optionally clean up DNS/tunnel.
390+ """Stop cloudflared on the VM and optionally clean up DNS/tunnel.
364391
365392 By default, DNS records and the tunnel are preserved so the same URL keeps
366393 working across controller restarts. Pass ``delete_tunnel=True`` for full
367394 cleanup (e.g. on ``iris cluster stop``).
368395 """
369- # Kill cloudflared process
370- if handle .process and handle .process .poll () is None :
371- logger .info ("Stopping cloudflared (pid=%d)" , handle .process .pid )
372- handle .process .terminate ()
396+ # Kill cloudflared on the VM
397+ if vm is not None :
373398 try :
374- handle .process .wait (timeout = 10 )
375- except subprocess .TimeoutExpired :
376- handle .process .kill ()
377- handle .process .wait ()
378- logger .info ("cloudflared stopped" )
399+ _stop_cloudflared_on_vm (vm )
400+ except Exception :
401+ logger .warning ("Failed to stop cloudflared on VM" , exc_info = True )
379402
380403 if not delete_tunnel :
381404 return
0 commit comments