Merge pull request #306 from one-covenant/feat/k3s-replace-https-alb

epappas · web-flow · commit fbcdecb7c312 · 2025-12-22T22:55:58.000Z
feat(cloud): add HTTPS support for deployments ALB
diff --git a/crates/basilica-autoscaler/src/controllers/scaling_policy_controller.rs b/crates/basilica-autoscaler/src/controllers/scaling_policy_controller.rs
@@ -146,15 +146,12 @@ where
         }
 
         // Evaluate scaling decision with GPU-aware idle node matching
-        let decision = self.evaluate_scaling(
-            &policy.spec,
-            &metrics_snapshot,
-            current_nodes,
-            &status,
-            &pending_gpu_pods,
-            &node_pools,
-            &gpu_nodes,
-        );
+        let ctx = ScalingContext {
+            pending_pods: &pending_gpu_pods,
+            node_pools: &node_pools,
+            k8s_nodes: &gpu_nodes,
+        };
+        let decision = self.evaluate_scaling(&policy.spec, &metrics_snapshot, current_nodes, &ctx);
 
         match decision {
             ScalingDecision::ScaleUp(count) => {
@@ -717,18 +714,18 @@ where
         spec: &ScalingPolicySpec,
         metrics: &MetricsSnapshot,
         current_nodes: u32,
-        _status: &ScalingPolicyStatus,
-        pending_pods: &[Pod],
-        node_pools: &[NodePool],
-        k8s_nodes: &[k8s_openapi::api::core::v1::Node],
+        ctx: &ScalingContext<'_>,
     ) -> ScalingDecision {
         // Check scale up: pending GPU pods exceed threshold
         if metrics.pending_gpu_pods >= spec.scale_up.pending_pod_threshold {
             // GPU-aware idle check: only skip scale-up if idle nodes can actually serve
             // the pending pods based on GPU model, count, and memory requirements.
             // Also checks that nodes are schedulable (no disk pressure, etc.)
-            let serviceable_count =
-                Self::count_serviceable_pending_pods(pending_pods, node_pools, k8s_nodes);
+            let serviceable_count = Self::count_serviceable_pending_pods(
+                ctx.pending_pods,
+                ctx.node_pools,
+                ctx.k8s_nodes,
+            );
 
             if serviceable_count >= metrics.pending_gpu_pods && metrics.idle_nodes > 0 {
                 debug!(
@@ -1714,6 +1711,14 @@ enum ScalingDecision {
     NoAction,
 }
 
+/// Borrowed inputs for a scaling evaluation: pending pods, node pools, and Kubernetes nodes.
+/// Passed to `evaluate_scaling` as one argument in place of separate slice parameters.
+struct ScalingContext<'a> {
+    pending_pods: &'a [Pod],
+    node_pools: &'a [NodePool],
+    k8s_nodes: &'a [k8s_openapi::api::core::v1::Node],
+}
+
 /// Result of offering lookup with fallback handling
 enum OfferingResult {
     Found(crate::api::GpuOffering),
diff --git a/orchestrator/cloud/main.tf b/orchestrator/cloud/main.tf
@@ -74,9 +74,11 @@ module "k3s_nlb" {
 module "deployments_alb" {
   source = "./modules/deployments-alb"
 
-  name_prefix = local.name_prefix
-  vpc_id      = module.networking.vpc_id
-  subnet_ids  = module.networking.public_subnet_ids
+  name_prefix     = local.name_prefix
+  vpc_id          = module.networking.vpc_id
+  subnet_ids      = module.networking.public_subnet_ids
+  enable_https    = var.deployments_alb_enable_https
+  certificate_arn = var.deployments_alb_certificate_arn
 
   tags = local.common_tags
 
diff --git a/orchestrator/cloud/modules/deployments-alb/main.tf b/orchestrator/cloud/modules/deployments-alb/main.tf
@@ -9,6 +9,7 @@ resource "aws_lb" "deployments" {
   enable_deletion_protection       = false
   enable_http2                     = true
   enable_cross_zone_load_balancing = true
+  idle_timeout                     = 4000 # Max for ALB (seconds)
 
   tags = merge(var.tags, {
     Name = "${var.name_prefix}-deploy-alb"
@@ -57,9 +58,26 @@ resource "aws_lb_listener" "http" {
   })
 }
 
-# HTTPS listener removed - not needed with Cloudflare Flexible mode
-# Cloudflare connects to ALB via HTTP (port 80) only
-# If direct HTTPS access to ALB is needed, add an HTTPS listener with a certificate
+# HTTPS listener - terminates TLS at the ALB; created only when var.enable_https is true
+# Required when clients reach the ALB directly instead of through the Cloudflare proxy
+resource "aws_lb_listener" "https" {
+  count = var.enable_https ? 1 : 0
+
+  load_balancer_arn = aws_lb.deployments.arn
+  port              = 443
+  protocol          = "HTTPS"
+  ssl_policy        = "ELBSecurityPolicy-TLS13-1-2-2021-06"
+  certificate_arn   = var.certificate_arn
+
+  default_action {
+    type             = "forward"
+    target_group_arn = aws_lb_target_group.envoy.arn
+  }
+
+  tags = merge(var.tags, {
+    Name = "${var.name_prefix}-https-listener"
+  })
+}
 
 # Security group for ALB
 resource "aws_security_group" "alb" {
diff --git a/orchestrator/cloud/modules/deployments-alb/variables.tf b/orchestrator/cloud/modules/deployments-alb/variables.tf
@@ -18,3 +18,15 @@ variable "tags" {
   type        = map(string)
   default     = {}
 }
+
+variable "certificate_arn" {
+  description = "ACM certificate ARN for HTTPS listener"
+  type        = string
+  default     = ""
+}
+
+variable "enable_https" {
+  description = "Enable HTTPS listener with TLS termination at ALB"
+  type        = bool
+  default     = false
+}
diff --git a/orchestrator/cloud/variables.tf b/orchestrator/cloud/variables.tf
@@ -109,3 +109,15 @@ variable "deployment_public_ip" {
   description = "Public IP/DNS for user deployments (defaults to primary K3s server IP, update after Envoy LB is provisioned)"
   default     = ""
 }
+
+variable "deployments_alb_enable_https" {
+  type        = bool
+  description = "Enable HTTPS listener on deployments ALB with TLS termination"
+  default     = false
+}
+
+variable "deployments_alb_certificate_arn" {
+  type        = string
+  description = "ACM certificate ARN for HTTPS listener on deployments ALB"
+  default     = ""
+}
diff --git a/scripts/cloud/compute.tf b/scripts/cloud/compute.tf
@@ -383,15 +383,15 @@ module "basilica_api_service" {
 
     # Cloudflare Integration
     BASILICA_API_DNS__ENABLED      = "true"
-    BASILICA_API_DNS__PROXY        = "true"
+    BASILICA_API_DNS__PROXY        = "false"
     BASILICA_API_DNS__API_TOKEN    = var.cloudflare_api_token
     BASILICA_API_DNS__ZONE_ID      = var.cloudflare_zone_id
     BASILICA_API_DNS__DOMAIN       = var.cloudflare_domain
     BASILICA_API_DNS__ALB_DNS_NAME = var.deployments_alb_dns_name
     CLOUDFLARE_API_TOKEN           = var.cloudflare_api_token
     CLOUDFLARE_ZONE_ID             = var.cloudflare_zone_id
     CLOUDFLARE_DOMAIN              = var.cloudflare_domain
-    CLOUDFLARE_PROXY               = "true"
+    CLOUDFLARE_PROXY               = "false"
     ALB_DNS_NAME                   = var.deployments_alb_dns_name
 
     # K3S_SERVER_URL for interacting with the cluster