Skip to content

Commit 366294a

Browse files
authored
Merge pull request #327 from pfeifferj/feat/improved-multi-zone
feat(vpc): improved multi-zone
2 parents 7dd41f8 + 1843eea commit 366294a

File tree

21 files changed

+2079
-107
lines changed

21 files changed

+2079
-107
lines changed

.github/workflows/e2e-tests-in-cluster.yaml

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -177,11 +177,14 @@ jobs:
177177
# Image selector tests from image_selector_test.go
178178
image_selector_tests="TestE2EImageSelector"
179179
180+
# Multi-zone tests from multizone_test.go
181+
multizone_tests="TestE2EMultiZoneDistribution TestE2EZoneAntiAffinity TestE2ETopologySpreadConstraints TestE2EPlacementStrategyValidation TestE2EZoneFailover"
182+
180183
# Cleanup tests from cleanup_test.go
181184
cleanup_tests="TestE2ECleanupNodePoolDeletion TestE2ECleanupNodeClassDeletion TestE2ECleanupOrphanedResources TestE2ECleanupIBMCloudResources"
182185
183186
# Combine all tests
184-
all_tests="$core_tests $validation_tests $block_device_tests $scheduling_tests $userdata_tests $image_selector_tests $cleanup_tests"
187+
all_tests="$core_tests $validation_tests $block_device_tests $scheduling_tests $userdata_tests $image_selector_tests $multizone_tests $cleanup_tests"
185188
186189
test_failed="false"
187190
passed_tests=0
@@ -195,6 +198,7 @@ jobs:
195198
echo " Scheduling Tests: $(echo $scheduling_tests | wc -w)"
196199
echo " UserData Tests: $(echo $userdata_tests | wc -w)"
197200
echo " Image Selector Tests: $(echo $image_selector_tests | wc -w)"
201+
echo " Multi-Zone Tests: $(echo $multizone_tests | wc -w)"
198202
echo " Cleanup Tests: $(echo $cleanup_tests | wc -w)"
199203
echo " Total Tests: $total_tests"
200204
echo ""
@@ -212,6 +216,9 @@ jobs:
212216
"TestE2EDriftStability")
213217
timeout="30m" # Drift test needs more time for monitoring
214218
;;
219+
"TestE2EMultiZone"*|"TestE2EZone"*|"TestE2ETopology"*|"TestE2EPlacementStrategy"*)
220+
timeout="25m" # Multi-zone tests need extra time for cross-zone provisioning
221+
;;
215222
"TestE2ECleanup"*)
216223
timeout="15m" # Cleanup tests are typically faster
217224
;;

.github/workflows/e2e-tests-pr.yml

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -336,11 +336,14 @@ jobs:
336336
# Image selector tests from image_selector_test.go
337337
image_selector_tests="TestE2EImageSelector"
338338
339+
# Multi-zone tests from multizone_test.go
340+
multizone_tests="TestE2EMultiZoneDistribution TestE2EZoneAntiAffinity TestE2ETopologySpreadConstraints TestE2EPlacementStrategyValidation TestE2EZoneFailover"
341+
339342
# Cleanup tests from cleanup_test.go
340343
cleanup_tests="TestE2ECleanupNodePoolDeletion TestE2ECleanupNodeClassDeletion TestE2ECleanupOrphanedResources TestE2ECleanupIBMCloudResources"
341344
342345
# Combine all tests
343-
all_tests="$core_tests $validation_tests $block_device_tests $scheduling_tests $userdata_tests $image_selector_tests $cleanup_tests"
346+
all_tests="$core_tests $validation_tests $block_device_tests $scheduling_tests $userdata_tests $image_selector_tests $multizone_tests $cleanup_tests"
344347
345348
test_failed="false"
346349
passed_tests=0
@@ -354,6 +357,7 @@ jobs:
354357
echo " Scheduling Tests: $(echo $scheduling_tests | wc -w)"
355358
echo " UserData Tests: $(echo $userdata_tests | wc -w)"
356359
echo " Image Selector Tests: $(echo $image_selector_tests | wc -w)"
360+
echo " Multi-Zone Tests: $(echo $multizone_tests | wc -w)"
357361
echo " Cleanup Tests: $(echo $cleanup_tests | wc -w)"
358362
echo " Total Tests: $total_tests"
359363
echo ""
@@ -370,6 +374,9 @@ jobs:
370374
"TestE2EDriftStability")
371375
timeout="30m" # Drift test needs more time for monitoring
372376
;;
377+
"TestE2EMultiZone"*|"TestE2EZone"*|"TestE2ETopology"*|"TestE2EPlacementStrategy"*)
378+
timeout="25m" # Multi-zone tests need extra time for cross-zone provisioning
379+
;;
373380
"TestE2ECleanup"*)
374381
timeout="15m" # Cleanup tests are typically faster
375382
;;

charts/crds/karpenter.sh_nodeoverlays.yaml

Lines changed: 214 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

charts/templates/deployment.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,10 @@ spec:
9797
value: {{ .Values.controller.metrics.port | quote }}
9898
- name: HEALTH_PROBE_PORT
9999
value: {{ .Values.controller.healthProbe.port | quote }}
100+
{{- if .Values.controller.orphanCleanup.enabled }}
101+
- name: KARPENTER_ENABLE_ORPHAN_CLEANUP
102+
value: "true"
103+
{{- end }}
100104
{{- if .Values.customResources.enabled }}
101105
{{- include "karpenter.cr.environmentConfig" . | nindent 12 }}
102106
{{- include "karpenter.cr.bootstrapConfig" . | nindent 12 }}

charts/values.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -453,6 +453,12 @@ additionalCAs:
453453

454454
# Controller configuration
455455
controller:
456+
# Orphan cleanup configuration
457+
orphanCleanup:
458+
# Enable cleanup of orphaned instances (instances without NodeClaims)
459+
# This prevents resource leaks when NodeClaims are deleted before instances are cleaned up
460+
enabled: true
461+
456462
metrics:
457463
port: 8080
458464
healthProbe:

cmd/controller/main.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,7 @@ func main() {
9393
op.UnavailableOfferings,
9494
cloudProvider,
9595
op.ProviderFactory.GetInstanceTypeProvider(),
96+
op.ProviderFactory.GetSubnetProvider(),
9697
op.ProviderFactory.GetClient(),
9798
)...).
9899
Start(ctx)

docs/getting-started.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,7 @@ ibmcloud is images --visibility public --status available | grep ubuntu
8585
- **API Server Endpoint**: Get your cluster's API endpoint (e.g., `https://10.240.0.1:6443`)
8686
- **Region**: `us-south` (or your preferred region)
8787
- **Zone**: `us-south-1` (subnet's availability zone)
88+
- 💡 **Multi-Zone Tip**: For production deployments, consider using `placementStrategy` instead of explicit zones for automatic multi-zone distribution. See [Multi-Zone VPC Setup with Placement Constraints](vpc-integration.md#multi-zone-vpc-setup-with-placement-constraints).
8889

8990
## Installation
9091

0 commit comments

Comments
 (0)