@@ -203,7 +203,7 @@ jobs:
203203 needs : [gate, build]
204204 if : ${{ needs.gate.outputs.approved == 'true' }}
205205 runs-on : [self-hosted, ibm-e2e]
206- timeout-minutes : 180
206+ timeout-minutes : 210
207207 container :
208208 image : golang:1.24.6
209209 options : --user 0
@@ -272,28 +272,28 @@ jobs:
272272 kubectl delete ibmnodeclasses -l test=e2e --timeout=300s || true
273273
274274 echo "⏳ Waiting for cluster stabilization..."
275- for i in {1..20 }; do
275+ for i in {1..30 }; do
276276 pending_pods=$(kubectl get pods -l test=e2e --all-namespaces --field-selector=status.phase=Pending --no-headers 2>/dev/null | wc -l)
277277 if [ "$pending_pods" -eq 0 ]; then
278278 echo "✅ No pending e2e pods found"
279279 break
280280 fi
281281 echo "⏳ Still have $pending_pods pending e2e pods, waiting..."
282- sleep 5
282+ sleep 10
283283 done
284284
285- for i in {1..20 }; do
285+ for i in {1..30 }; do
286286 disrupted_nodes=$(kubectl get nodes --no-headers -o custom-columns="NAME:.metadata.name,TAINTS:.spec.taints[*].key" | grep -c "karpenter.sh/disrupted" 2>/dev/null || echo "0")
287287 disrupted_nodes=$(echo "$disrupted_nodes" | tr -d '\n' | grep -o '[0-9]*' || echo "0")
288288 if [ "$disrupted_nodes" -eq 0 ]; then
289289 echo "✅ No disrupted nodes found"
290290 break
291291 fi
292292 echo "⏳ Still have $disrupted_nodes disrupted nodes, waiting..."
293- sleep 5
293+ sleep 10
294294 done
295295
296- sleep 15
296+ sleep 30
297297 echo "✅ Pre-test cleanup completed"
298298
299299 - name : Run E2E tests
@@ -317,45 +317,49 @@ jobs:
317317 run : |
318318 echo "🚀 Starting E2E test suite..."
319319
320- # Define test groups - optimized for faster execution
321- # Critical path tests (must run)
322- critical_tests ="TestE2EFullWorkflow TestE2EImageSelector TestE2EDriftStability"
320+ # Define test groups
321+ # Core functionality tests from basic_workflow_test.go
322+ core_tests ="TestE2EFullWorkflow TestE2ENodePoolInstanceTypeSelection TestE2EInstanceTypeSelection TestE2EDriftStability"
323323
324- # Validation tests (combine similar tests)
325- validation_tests="TestE2ENodeClassValidation TestE2ENodeClassWithMissingFields"
324+ # NodeClass validation tests from validation_test.go
325+ validation_tests="TestE2ENodeClassValidation TestE2EValidNodeClassCreation TestE2ENodeClassWithMissingFields"
326326
327- # Block device tests (keep both for coverage)
327+ # Block device mapping tests from block_device_test.go
328328 block_device_tests="TestE2EBlockDeviceMapping TestE2EBlockDeviceMappingValidation"
329329
330- # Essential scheduling tests (reduce redundant taint tests)
331- scheduling_tests="TestE2EPodDisruptionBudget TestE2EPodAntiAffinity TestE2ENodeAffinity TestE2ETaintsBasicScheduling TestE2ETaintSync"
330+ # Scheduling constraint tests from scheduling_test.go and e2e_taints_test.go
331+ scheduling_tests="TestE2EPodDisruptionBudget TestE2EConsolidationWithPDB TestE2EPodAntiAffinity TestE2ENodeAffinity TestE2EStartupTaints TestE2EStartupTaintsRemoval TestE2ETaintsBasicScheduling TestE2ETaintValues TestE2ETaintSync TestE2EUnregisteredTaintHandling "
332332
333- # UserData tests (keep both variants)
333+ # UserData feature tests from userdata_test.go
334334 userdata_tests="TestE2EUserDataAppend TestE2EStandardBootstrap"
335335
336- # Multi-zone tests (reduce to essential tests)
337- multizone_tests="TestE2EMultiZoneDistribution TestE2ETopologySpreadConstraints TestE2EZoneFailover "
336+ # Image selector tests from image_selector_test.go
337+ image_selector_tests="TestE2EImageSelector "
338338
339- # Cleanup tests (combine related cleanup)
340- cleanup_tests="TestE2ECleanupNodePoolDeletion TestE2ECleanupOrphanedResources"
339+ # Multi-zone tests from multizone_test.go
340+ multizone_tests="TestE2EMultiZoneDistribution TestE2EZoneAntiAffinity TestE2ETopologySpreadConstraints TestE2EPlacementStrategyValidation TestE2EZoneFailover"
341+
342+ # Cleanup tests from cleanup_test.go
343+ cleanup_tests="TestE2ECleanupNodePoolDeletion TestE2ECleanupNodeClassDeletion TestE2ECleanupOrphanedResources TestE2ECleanupIBMCloudResources"
341344
342345 # Combine all tests
343- all_tests="$critical_tests $validation_tests $block_device_tests $scheduling_tests $userdata_tests $multizone_tests $cleanup_tests"
346+ all_tests="$core_tests $validation_tests $block_device_tests $scheduling_tests $userdata_tests $image_selector_tests $multizone_tests $cleanup_tests"
344347
345348 test_failed="false"
346349 passed_tests=0
347350 failed_tests=0
348351 total_tests=$(echo $all_tests | wc -w)
349352
350- echo "📋 Test Suite Summary (Optimized) :"
351- echo " Critical Tests: $(echo $critical_tests | wc -w)"
353+ echo "📋 Test Suite Summary:"
354+ echo " Core Tests: $(echo $core_tests | wc -w)"
352355 echo " Validation Tests: $(echo $validation_tests | wc -w)"
353356 echo " Block Device Tests: $(echo $block_device_tests | wc -w)"
354357 echo " Scheduling Tests: $(echo $scheduling_tests | wc -w)"
355358 echo " UserData Tests: $(echo $userdata_tests | wc -w)"
359+ echo " Image Selector Tests: $(echo $image_selector_tests | wc -w)"
356360 echo " Multi-Zone Tests: $(echo $multizone_tests | wc -w)"
357361 echo " Cleanup Tests: $(echo $cleanup_tests | wc -w)"
358- echo " Total Tests: $total_tests (reduced from 31) "
362+ echo " Total Tests: $total_tests"
359363 echo ""
360364
361365 for test in $all_tests; do
@@ -365,42 +369,29 @@ jobs:
365369 echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
366370
367371 # Set appropriate timeout based on test type
368- timeout="15m "
372+ timeout="20m "
369373 case "$test" in
370374 "TestE2EDriftStability")
371- timeout="25m " # Drift test needs more time for monitoring
375+ timeout="30m " # Drift test needs more time for monitoring
372376 ;;
373377 "TestE2EMultiZone"*|"TestE2EZone"*|"TestE2ETopology"*|"TestE2EPlacementStrategy"*)
374- timeout="20m " # Multi-zone tests need extra time for cross-zone provisioning
378+ timeout="25m " # Multi-zone tests need extra time for cross-zone provisioning
375379 ;;
376380 "TestE2ECleanup"*)
377- timeout="12m " # Cleanup tests are typically faster
381+ timeout="15m " # Cleanup tests are typically faster
378382 ;;
379383 "TestE2EValidation"*|"TestE2ENodeClass"*)
380- timeout="8m" # Validation tests are quick
381- ;;
382- "TestE2EImageSelector")
383- timeout="12m" # Image selector test optimized
384+ timeout="10m" # Validation tests are quick
384385 ;;
385386 *)
386- timeout="15m " # Default timeout for other tests
387+ timeout="20m " # Default timeout for other tests
387388 ;;
388389 esac
389390
390391 # Create test-specific log file to capture all output
391392 test_log="test-artifacts/${test}-$(date +%s).log"
392393 mkdir -p test-artifacts
393394
394- # Check circuit breaker status before running test
395- circuit_breaker_check=$(kubectl logs -n karpenter -l app.kubernetes.io/name=karpenter --tail=100 2>/dev/null | grep -c "circuit breaker is OPEN" || echo "0")
396- if [ "$circuit_breaker_check" -gt 0 ]; then
397- echo "⚠️ Circuit breaker detected as OPEN, attempting reset..."
398- # Restart Karpenter to reset circuit breaker
399- kubectl rollout restart deployment/karpenter-karpenter-ibm -n karpenter
400- kubectl rollout status deployment/karpenter-karpenter-ibm -n karpenter --timeout=60s
401- sleep 10
402- fi
403-
404395 # Run test with enhanced logging and crash recovery
405396 set +e # Don't exit on failure
406397 timeout $timeout go test -tags=e2e -v -timeout $timeout ./test/e2e -run "^$test$" -count=1 2>&1 | tee "$test_log"
@@ -451,7 +442,7 @@ jobs:
451442 kubectl delete ibmnodeclasses -l test=e2e --timeout=300s || true
452443
453444 echo "⏳ Waiting for cleanup to complete..."
454- sleep 15
445+ sleep 30
455446
456447 kubectl get nodes --no-headers | grep -c Ready | xargs echo "Ready nodes:"
457448 kubectl get nodeclaims --no-headers | grep -c True | xargs echo "Ready nodeclaims:" || echo "Ready nodeclaims: 0"
0 commit comments