diff --git a/service-mesh/Dockerfile b/service-mesh/Dockerfile
new file mode 100644
index 0000000..95b523e
--- /dev/null
+++ b/service-mesh/Dockerfile
@@ -0,0 +1,33 @@
+# Build stage: Compile dstack-mesh
+FROM rust:1.86-alpine AS rust-builder
+RUN apk add --no-cache musl-dev
+RUN rustup target add x86_64-unknown-linux-musl
+WORKDIR /build
+COPY . /build
+RUN cargo build --release --target x86_64-unknown-linux-musl
+
+# Final stage: Alpine with bash, curl, jq
+FROM alpine:3.20
+
+# Install required tools (kept in alphabetical order)
+RUN apk add --no-cache \
+    bash \
+    ca-certificates \
+    curl \
+    jq
+
+# Copy dstack-mesh binary (statically linked musl binary)
+COPY --from=rust-builder /build/target/x86_64-unknown-linux-musl/release/dstack-mesh /usr/local/bin/dstack-mesh
+
+# Create necessary directories
+RUN mkdir -p /etc/dstack /etc/ssl/certs /etc/ssl/private /tmp /var/lib/dstack-mesh
+
+# Copy entrypoint script
+COPY entrypoint.sh /entrypoint.sh
+RUN chmod +x /entrypoint.sh
+
+WORKDIR /app
+
+EXPOSE 8091 8092
+
+ENTRYPOINT ["/entrypoint.sh"]
diff --git a/service-mesh/entrypoint.sh b/service-mesh/entrypoint.sh
new file mode 100644
index 0000000..12c5420
--- /dev/null
+++ b/service-mesh/entrypoint.sh
@@ -0,0 +1,184 @@
+#!/bin/bash
+set -e
+
+# ============================================================================
+# Environment Variables Validation
+# ============================================================================
+if [ -z "$DSTACK_GATEWAY_DOMAIN" ]; then
+    echo "ERROR: DSTACK_GATEWAY_DOMAIN environment variable is not set"
+    exit 1
+fi
+
+# ============================================================================
+# Initialization State Check
+# ============================================================================
+DATA_DIR="/var/lib/dstack-mesh"
+STATE_FILE="$DATA_DIR/.service-mesh-state"
+CONFIG_FILE="/etc/dstack/dstack-mesh.toml"
+CERT_KEY="/etc/ssl/private/server.key"
+CERT_FILE="/etc/ssl/certs/server.crt"
+CA_FILE="/etc/ssl/certs/ca.crt"
+
+# Create data directory if it doesn't exist
+mkdir -p "$DATA_DIR"
+
+# Check if we need to initialize
+NEEDS_INIT=true
+
+if [ -f "$STATE_FILE" ]; then
+    echo "Found existing state file, checking..."
+
+    # Read saved state
+    SAVED_GATEWAY_DOMAIN=$(grep "DSTACK_GATEWAY_DOMAIN=" "$STATE_FILE" | cut -d'=' -f2-)
+
+    echo " Saved DSTACK_GATEWAY_DOMAIN: $SAVED_GATEWAY_DOMAIN"
+    echo " Current DSTACK_GATEWAY_DOMAIN: $DSTACK_GATEWAY_DOMAIN"
+
+    # Check if DSTACK_GATEWAY_DOMAIN is the same
+    if [ "$SAVED_GATEWAY_DOMAIN" = "$DSTACK_GATEWAY_DOMAIN" ]; then
+        # Require every file to exist and be non-empty: an aborted run can leave empty files behind
+        if [ -s "$DATA_DIR/dstack-mesh.toml" ] && \
+           [ -s "$DATA_DIR/server.key" ] && \
+           [ -s "$DATA_DIR/server.crt" ] && \
+           [ -s "$DATA_DIR/ca.crt" ]; then
+            echo "✓ Configuration is up-to-date, skipping initialization"
+            NEEDS_INIT=false
+        else
+            echo "⚠ Some configuration files are missing, re-initializing..."
+        fi
+    else
+        echo "⚠ DSTACK_GATEWAY_DOMAIN changed, re-initializing..."
+    fi
+else
+    echo "No previous state found, initializing..."
+fi
+
+# ============================================================================
+# Phase 1: Generate Configuration and Certificates (if needed)
+# ============================================================================
+if [ "$NEEDS_INIT" = true ]; then
+    echo "=========================================="
+    echo "dstack-mesh Bootstrap"
+    echo " Gateway Domain: $DSTACK_GATEWAY_DOMAIN"
+    echo "=========================================="
+
+    # Generate dstack-mesh.toml configuration
+    echo "Generating dstack-mesh configuration..."
+    # NOTE(review): the next line is truncated in the reviewed copy of this
+    # patch: everything between the first "<" and the redirection into
+    # /tmp/server_response.json is missing. Restore it from the original change.
+    cat > "$DATA_DIR/dstack-mesh.toml" < /tmp/server_response.json; then
+        echo "ERROR: Failed to generate certificates - dstack.sock may not be available"
+        exit 1
+    fi
+
+    # Validate JSON response
+    if ! jq -e . /tmp/server_response.json >/dev/null 2>&1; then
+        echo "ERROR: Invalid JSON response from dstack.sock"
+        exit 1
+    fi
+
+    # Extract server key and certificates to volume.
+    # jq -e exits non-zero when the selected value is null or missing, so an
+    # incomplete response is reported here instead of the literal text "null"
+    # being written to disk and passing a non-empty check.
+    echo "Extracting server key and certificates..."
+
+    # The private key is made readable by its owner only; chmod also covers a
+    # key file left by an earlier run, whose mode ">" would not change.
+    if ! (umask 077 && jq -er '.key' /tmp/server_response.json > "$DATA_DIR/server.key" \
+            && chmod 600 "$DATA_DIR/server.key"); then
+        echo "ERROR: Failed to extract server key"
+        exit 1
+    fi
+
+    if ! jq -er '.certificate_chain[]' /tmp/server_response.json > "$DATA_DIR/server.crt"; then
+        echo "ERROR: Failed to extract server certificate"
+        exit 1
+    fi
+
+    if ! jq -er '.certificate_chain[-1]' /tmp/server_response.json > "$DATA_DIR/ca.crt"; then
+        echo "ERROR: Failed to extract CA certificate"
+        exit 1
+    fi
+
+    echo "✓ Certificates generated and saved to $DATA_DIR"
+
+    # Save state
+    # NOTE(review): the next line is truncated in the reviewed copy of this
+    # patch: the text after the first "<" is missing, and what follows appears
+    # to belong to a different file (presumably vpc-node/Dockerfile) - confirm.
+    cat > "$STATE_FILE" < /dev/null || tailscale status --json | jq -e '.BackendState == "Running"' > /dev/null || exit 1
+
+ENTRYPOINT ["/entrypoint.sh"]
diff --git a/vpc-node/entrypoint.sh b/vpc-node/entrypoint.sh
new file mode 100644
index 0000000..a19736b
--- /dev/null
+++ b/vpc-node/entrypoint.sh
@@ -0,0 +1,217 @@
+#!/bin/sh
+set -e
+
+# ============================================================================
+# Environment Variables Validation
+# ============================================================================
+if [ -z "$NODE_NAME" ]; then
+    echo "ERROR: NODE_NAME environment variable is not set"
+    exit 1
+fi
+
+if [ -z "$VPC_SERVER_APP_ID" ]; then
+    echo "ERROR: VPC_SERVER_APP_ID environment variable is not set"
+    exit 1
+fi
+
+# Set default DSTACK_MESH_URL if not provided
+DSTACK_MESH_URL=${DSTACK_MESH_URL:-"http://localhost:8091"}
+VPC_SERVER_PORT=${VPC_SERVER_PORT:-"443"}
+
+# ============================================================================
+# Initialization State Check
+# ============================================================================
+DATA_DIR="/var/lib/vpc-node"
+STATE_FILE="$DATA_DIR/.vpc-node-state"
+CONFIG_FILE="/etc/tailscale/config.json"
+
+# Create data directory if it doesn't exist
+mkdir -p "$DATA_DIR"
+
+# Check if we need to initialize
+NEEDS_INIT=true
+
+if [ -f "$STATE_FILE" ]; then
+    echo "Found existing state file, checking..."
+
+    # Read saved state
+    SAVED_NODE_NAME=$(grep "NODE_NAME=" "$STATE_FILE" | cut -d'=' -f2-)
+    SAVED_VPC_SERVER_APP_ID=$(grep "VPC_SERVER_APP_ID=" "$STATE_FILE" | cut -d'=' -f2-)
+
+    echo " Saved NODE_NAME: $SAVED_NODE_NAME"
+    echo " Current NODE_NAME: $NODE_NAME"
+
+    # Check if NODE_NAME or VPC_SERVER_APP_ID changed
+    if [ "$SAVED_NODE_NAME" = "$NODE_NAME" ] && [ "$SAVED_VPC_SERVER_APP_ID" = "$VPC_SERVER_APP_ID" ]; then
+        # Require every file to exist and be non-empty: an aborted run can leave empty files behind
+        if [ -s "$DATA_DIR/config.json" ] && \
+           [ -s "$DATA_DIR/pre_auth_key" ] && \
+           [ -s "$DATA_DIR/server_url" ]; then
+            echo "✓ Configuration is up-to-date, skipping initialization"
+            NEEDS_INIT=false
+        else
+            echo "⚠ Some configuration files are missing, re-initializing..."
+        fi
+    else
+        echo "⚠ NODE_NAME or VPC_SERVER_APP_ID changed, re-initializing..."
+    fi
+else
+    echo "No previous state found, initializing..."
+fi
+
+# ============================================================================
+# Phase 1: Bootstrap - Fetch VPC credentials (if needed)
+# ============================================================================
+if [ "$NEEDS_INIT" = true ]; then
+    echo "=========================================="
+    echo "VPC Node Bootstrap"
+    echo " Node Name: $NODE_NAME"
+    echo " VPC Server App ID: $VPC_SERVER_APP_ID"
+    echo " VPC Server PORT: $VPC_SERVER_PORT"
+    echo " Mesh URL: $DSTACK_MESH_URL"
+    echo "=========================================="
+
+    echo "Fetching instance info from dstack-mesh..."
+    # A failing command substitution would stop the script under `set -e`
+    # before the diagnostic below is printed, so failures become empty values.
+    INFO=$(curl -s "$DSTACK_MESH_URL/info") || INFO=""
+    INSTANCE_ID=$(echo "$INFO" | jq -r .instance_id 2>/dev/null) || INSTANCE_ID=""
+
+    if [ -z "$INSTANCE_ID" ] || [ "$INSTANCE_ID" = "null" ]; then
+        echo "ERROR: Failed to get instance_id from mesh"
+        echo "Response: $INFO"
+        exit 1
+    fi
+
+    echo "Instance ID: $INSTANCE_ID"
+
+    echo "Registering with VPC server..."
+    echo " URL: $DSTACK_MESH_URL/api/register?instance_id=$INSTANCE_ID&node_name=$NODE_NAME"
+    echo " x-dstack-target-app: $VPC_SERVER_APP_ID"
+
+    # -G with --data-urlencode sends the same GET query string but
+    # percent-encodes the values, so a NODE_NAME containing "&", "#" or spaces
+    # cannot corrupt the request. A curl failure is reported, not silent.
+    if ! RESPONSE=$(curl -s -G -w "\nHTTP_CODE:%{http_code}" \
+        -H "x-dstack-target-app: $VPC_SERVER_APP_ID" \
+        -H "x-dstack-target-port: $VPC_SERVER_PORT" \
+        -H "Host: dstack-vpc-server" \
+        --data-urlencode "instance_id=$INSTANCE_ID" \
+        --data-urlencode "node_name=$NODE_NAME" \
+        "$DSTACK_MESH_URL/api/register"); then
+        echo "ERROR: Registration request could not be completed"
+        exit 1
+    fi
+
+    # Extract HTTP code and body
+    HTTP_CODE=$(echo "$RESPONSE" | grep "HTTP_CODE:" | cut -d':' -f2)
+    BODY=$(echo "$RESPONSE" | sed '/HTTP_CODE:/d')
+
+    echo "Response Status: $HTTP_CODE"
+
+    # Check HTTP status code
+    if [ "$HTTP_CODE" != "200" ]; then
+        echo "ERROR: HTTP request failed with status $HTTP_CODE"
+        exit 1
+    fi
+
+    # Check if response is valid JSON
+    if ! echo "$BODY" | jq -e . >/dev/null 2>&1; then
+        echo "ERROR: Response is not valid JSON"
+        exit 1
+    fi
+
+    PRE_AUTH_KEY=$(echo "$BODY" | jq -r .pre_auth_key)
+    SHARED_KEY=$(echo "$BODY" | jq -r .shared_key)
+    VPC_SERVER_URL=$(echo "$BODY" | jq -r .server_url)
+
+    if [ -z "$PRE_AUTH_KEY" ] || [ "$PRE_AUTH_KEY" = "null" ] || \
+       [ -z "$VPC_SERVER_URL" ] || [ "$VPC_SERVER_URL" = "null" ]; then
+        echo "ERROR: Missing required fields in registration response"
+        exit 1
+    fi
+
+    echo "✓ Registration successful"
+
+    # ========================================================================
+    # Phase 2: Generate and save configuration
+    # ========================================================================
+    echo "Generating tailscaled config file..."
+    # NOTE(review): the next line is truncated in the reviewed copy of this
+    # patch: everything between the first "<" and the redirection into
+    # "$DATA_DIR/pre_auth_key" is missing. Restore it from the original change.
+    cat > "$DATA_DIR/config.json" < "$DATA_DIR/pre_auth_key"
+    echo "$SHARED_KEY" > "$DATA_DIR/shared_key"
+    echo "$VPC_SERVER_URL" > "$DATA_DIR/server_url"
+
+    # Save state
+    # NOTE(review): the next line is truncated in the reviewed copy of this
+    # patch: everything between the first "<" and "/dev/null || true" is
+    # missing. Restore it from the original change.
+    cat > "$STATE_FILE" </dev/null || true
+cp "$DATA_DIR/shared_key" /shared/shared_key 2>/dev/null || true
+cp "$DATA_DIR/server_url" /shared/server_url 2>/dev/null || true
+
+echo "✓ Config file ready: $CONFIG_FILE"
+
+# ============================================================================
+# Phase 4: Start tailscaled with config file (foreground)
+# ============================================================================
+TUN_DEV_NAME=${TUN_DEV_NAME:-"tailscale0"}
+DEBUG_ADDR=${DEBUG_ADDR:-"127.0.0.1:9002"}
+
+echo "=========================================="
+echo "Starting tailscaled with config file"
+echo " Config: $CONFIG_FILE"
+echo " TUN Device: $TUN_DEV_NAME"
+echo " State Directory: $DATA_DIR"
+echo " Debug Server: $DEBUG_ADDR (/debug/metrics)"
+echo "=========================================="
+
+# Handle TERM/INT that arrive before the exec below. The handler does not
+# outlive exec: caught signals are reset to their default action in the new
+# process image, so from then on shutdown handling is up to tailscaled.
+trap 'echo "Received shutdown signal, exiting..."; exit 0' TERM INT
+
+# Start tailscaled in foreground (becomes PID 1)
+# It will handle connection, reconnection, and state management
+# Use --statedir instead of --state to support network-lock and other features
+exec tailscaled \
+    --config="$CONFIG_FILE" \
+    --tun="$TUN_DEV_NAME" \
+    --statedir="$DATA_DIR" \
+    --socket=/var/run/tailscale/tailscaled.sock \
+    --debug="$DEBUG_ADDR"