Skip to content

Commit fbcdecb

Browse files
authored
Merge pull request #306 from one-covenant/feat/k3s-replace-https-alb
feat(cloud): add HTTPS support for deployments ALB
2 parents 5cbe613 + 61886a3 commit fbcdecb

File tree

6 files changed

+72
-23
lines changed

6 files changed

+72
-23
lines changed

crates/basilica-autoscaler/src/controllers/scaling_policy_controller.rs

Lines changed: 20 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -146,15 +146,12 @@ where
146146
}
147147

148148
// Evaluate scaling decision with GPU-aware idle node matching
149-
let decision = self.evaluate_scaling(
150-
&policy.spec,
151-
&metrics_snapshot,
152-
current_nodes,
153-
&status,
154-
&pending_gpu_pods,
155-
&node_pools,
156-
&gpu_nodes,
157-
);
149+
let ctx = ScalingContext {
150+
pending_pods: &pending_gpu_pods,
151+
node_pools: &node_pools,
152+
k8s_nodes: &gpu_nodes,
153+
};
154+
let decision = self.evaluate_scaling(&policy.spec, &metrics_snapshot, current_nodes, &ctx);
158155

159156
match decision {
160157
ScalingDecision::ScaleUp(count) => {
@@ -717,18 +714,18 @@ where
717714
spec: &ScalingPolicySpec,
718715
metrics: &MetricsSnapshot,
719716
current_nodes: u32,
720-
_status: &ScalingPolicyStatus,
721-
pending_pods: &[Pod],
722-
node_pools: &[NodePool],
723-
k8s_nodes: &[k8s_openapi::api::core::v1::Node],
717+
ctx: &ScalingContext<'_>,
724718
) -> ScalingDecision {
725719
// Check scale up: pending GPU pods exceed threshold
726720
if metrics.pending_gpu_pods >= spec.scale_up.pending_pod_threshold {
727721
// GPU-aware idle check: only skip scale-up if idle nodes can actually serve
728722
// the pending pods based on GPU model, count, and memory requirements.
729723
// Also checks that nodes are schedulable (no disk pressure, etc.)
730-
let serviceable_count =
731-
Self::count_serviceable_pending_pods(pending_pods, node_pools, k8s_nodes);
724+
let serviceable_count = Self::count_serviceable_pending_pods(
725+
ctx.pending_pods,
726+
ctx.node_pools,
727+
ctx.k8s_nodes,
728+
);
732729

733730
if serviceable_count >= metrics.pending_gpu_pods && metrics.idle_nodes > 0 {
734731
debug!(
@@ -1714,6 +1711,14 @@ enum ScalingDecision {
17141711
NoAction,
17151712
}
17161713

1714+
/// Context for scaling evaluation containing node and pod information.
1715+
/// Groups related parameters to reduce function argument count.
1716+
struct ScalingContext<'a> {
1717+
pending_pods: &'a [Pod],
1718+
node_pools: &'a [NodePool],
1719+
k8s_nodes: &'a [k8s_openapi::api::core::v1::Node],
1720+
}
1721+
17171722
/// Result of offering lookup with fallback handling
17181723
enum OfferingResult {
17191724
Found(crate::api::GpuOffering),

orchestrator/cloud/main.tf

Lines changed: 5 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -74,9 +74,11 @@ module "k3s_nlb" {
7474
module "deployments_alb" {
7575
source = "./modules/deployments-alb"
7676

77-
name_prefix = local.name_prefix
78-
vpc_id = module.networking.vpc_id
79-
subnet_ids = module.networking.public_subnet_ids
77+
name_prefix = local.name_prefix
78+
vpc_id = module.networking.vpc_id
79+
subnet_ids = module.networking.public_subnet_ids
80+
enable_https = var.deployments_alb_enable_https
81+
certificate_arn = var.deployments_alb_certificate_arn
8082

8183
tags = local.common_tags
8284

orchestrator/cloud/modules/deployments-alb/main.tf

Lines changed: 21 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,7 @@ resource "aws_lb" "deployments" {
99
enable_deletion_protection = false
1010
enable_http2 = true
1111
enable_cross_zone_load_balancing = true
12+
idle_timeout = 4000 # Max for ALB (seconds)
1213

1314
tags = merge(var.tags, {
1415
Name = "${var.name_prefix}-deploy-alb"
@@ -57,9 +58,26 @@ resource "aws_lb_listener" "http" {
5758
})
5859
}
5960

60-
# HTTPS listener removed - not needed with Cloudflare Flexible mode
61-
# Cloudflare connects to ALB via HTTP (port 80) only
62-
# If direct HTTPS access to ALB is needed, add an HTTPS listener with a certificate
61+
# HTTPS listener - TLS termination at ALB
62+
# Required when not using Cloudflare proxy (direct ALB access)
63+
resource "aws_lb_listener" "https" {
64+
count = var.enable_https ? 1 : 0
65+
66+
load_balancer_arn = aws_lb.deployments.arn
67+
port = 443
68+
protocol = "HTTPS"
69+
ssl_policy = "ELBSecurityPolicy-TLS13-1-2-2021-06"
70+
certificate_arn = var.certificate_arn
71+
72+
default_action {
73+
type = "forward"
74+
target_group_arn = aws_lb_target_group.envoy.arn
75+
}
76+
77+
tags = merge(var.tags, {
78+
Name = "${var.name_prefix}-https-listener"
79+
})
80+
}
6381

6482
# Security group for ALB
6583
resource "aws_security_group" "alb" {

orchestrator/cloud/modules/deployments-alb/variables.tf

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -18,3 +18,15 @@ variable "tags" {
1818
type = map(string)
1919
default = {}
2020
}
21+
22+
variable "certificate_arn" {
23+
description = "ACM certificate ARN for HTTPS listener"
24+
type = string
25+
default = ""
26+
}
27+
28+
variable "enable_https" {
29+
description = "Enable HTTPS listener with TLS termination at ALB"
30+
type = bool
31+
default = false
32+
}

orchestrator/cloud/variables.tf

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -109,3 +109,15 @@ variable "deployment_public_ip" {
109109
description = "Public IP/DNS for user deployments (defaults to primary K3s server IP, update after Envoy LB is provisioned)"
110110
default = ""
111111
}
112+
113+
variable "deployments_alb_enable_https" {
114+
type = bool
115+
description = "Enable HTTPS listener on deployments ALB with TLS termination"
116+
default = false
117+
}
118+
119+
variable "deployments_alb_certificate_arn" {
120+
type = string
121+
description = "ACM certificate ARN for HTTPS listener on deployments ALB"
122+
default = ""
123+
}

scripts/cloud/compute.tf

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -383,15 +383,15 @@ module "basilica_api_service" {
383383

384384
# Cloudflare Integration
385385
BASILICA_API_DNS__ENABLED = "true"
386-
BASILICA_API_DNS__PROXY = "true"
386+
BASILICA_API_DNS__PROXY = "false"
387387
BASILICA_API_DNS__API_TOKEN = var.cloudflare_api_token
388388
BASILICA_API_DNS__ZONE_ID = var.cloudflare_zone_id
389389
BASILICA_API_DNS__DOMAIN = var.cloudflare_domain
390390
BASILICA_API_DNS__ALB_DNS_NAME = var.deployments_alb_dns_name
391391
CLOUDFLARE_API_TOKEN = var.cloudflare_api_token
392392
CLOUDFLARE_ZONE_ID = var.cloudflare_zone_id
393393
CLOUDFLARE_DOMAIN = var.cloudflare_domain
394-
CLOUDFLARE_PROXY = "true"
394+
CLOUDFLARE_PROXY = "false"
395395
ALB_DNS_NAME = var.deployments_alb_dns_name
396396

397397
# K3S_SERVER_URL for interacting with the cluster

0 commit comments

Comments
 (0)