diff --git a/castai/resource_workload_scaling_policy.go b/castai/resource_workload_scaling_policy.go index 527a31fe..25c72dcf 100644 --- a/castai/resource_workload_scaling_policy.go +++ b/castai/resource_workload_scaling_policy.go @@ -36,6 +36,10 @@ const ( maxExponentValue = 1. minExponentValue = 0. defaultApplyType = "IMMEDIATE" + + // CPU stall defaults + defaultCPUStallMinPressuredPodPct = 50.0 + defaultCPUStallThresholdPct = 10.0 ) const ( @@ -66,6 +70,10 @@ const ( FieldApplyThresholdStrategyDefaultAdaptiveType = "DEFAULT_ADAPTIVE" FieldApplyThresholdStrategyCustomAdaptiveType = "CUSTOM_ADAPTIVE" FieldAssignmentRules = "assignment_rules" + FieldAnomalyDetection = "anomaly_detection" + FieldAnomalyDetectionCpuPressure = "cpu_pressure" + FieldCpuStallThresholdPercentage = "cpu_stall_threshold_percentage" + FieldMinPressuredPodPercentage = "min_pressured_pod_percentage" ) const ( @@ -346,6 +354,41 @@ It can be either: }, }, }, + FieldAnomalyDetection: { + Type: schema.TypeList, + Optional: true, + MaxItems: 1, + Description: "Defines anomaly detection settings for the scaling policy.", + DiffSuppressFunc: func(k, old, new string, d *schema.ResourceData) bool { + return suppressAnomalyDetectionDefaultValueDiff(old, new, d) + }, + Elem: &schema.Resource{ + Schema: map[string]*schema.Schema{ + FieldAnomalyDetectionCpuPressure: { + Type: schema.TypeList, + Optional: true, + MaxItems: 1, + Description: "Configures CPU pressure anomaly detection thresholds.", + Elem: &schema.Resource{ + Schema: map[string]*schema.Schema{ + FieldCpuStallThresholdPercentage: { + Type: schema.TypeFloat, + Required: true, + Description: "Percentage of time (0-100) that a pod must experience CPU pressure to be considered under pressure.", + ValidateDiagFunc: validation.ToDiagFunc(validation.FloatBetween(0, 100)), + }, + FieldMinPressuredPodPercentage: { + Type: schema.TypeFloat, + Required: true, + Description: "Percentage (0-100) of pods that must be experiencing pressure for the detector to trigger.", + ValidateDiagFunc: validation.ToDiagFunc(validation.FloatBetween(0, 100)), + }, + }, + }, + }, + }, + }, + }, }, Timeouts: &schema.ResourceTimeout{ Create: schema.DefaultTimeout(createTimeout), @@ -638,6 +681,8 @@ func resourceWorkloadScalingPolicyCreate(ctx context.Context, d *schema.Resource req.RecommendationPolicies.Jvm = toJvm(toSection(d, FieldJVM)) + req.RecommendationPolicies.AnomalyDetection = toAnomalyDetection(toSection(d, FieldAnomalyDetection)) + req.RecommendationPolicies.ExcludedContainers = toExcludedContainers(d) ar, err := toAssignmentRules(toSection(d, FieldAssignmentRules)) @@ -781,7 +826,9 @@ func fetchScalingPolicy(ctx context.Context, d *schema.ResourceData, meta any) ( if err := d.Set(FieldJVM, toJvmMap(sp.RecommendationPolicies.Jvm)); err != nil { return nil, fmt.Errorf("setting jvm: %w", err) } - + if err := d.Set(FieldAnomalyDetection, toAnomalyDetectionMap(sp.RecommendationPolicies.AnomalyDetection)); err != nil { + return nil, fmt.Errorf("setting anomaly detection: %w", err) + } if err := d.Set(FieldAssignmentRules, toAssignmentRulesMap(getResourceFrom(d, FieldAssignmentRules), sp.AssignmentRules)); err != nil { return nil, fmt.Errorf("setting assignment rules: %w", err) } @@ -820,6 +867,7 @@ func updateScalingPolicy(ctx context.Context, d *schema.ResourceData, meta any) FieldPredictiveScaling, FieldRolloutBehavior, FieldJVM, + FieldAnomalyDetection, FieldExcludedContainers, ) { tflog.Info(ctx, "scaling policy up to date") @@ -857,6 +905,7 @@ func updateScalingPolicy(ctx context.Context, d *schema.ResourceData, meta any) PredictiveScaling: toPredictiveScaling(toSection(d, FieldPredictiveScaling)), RolloutBehavior: toRolloutBehavior(toSection(d, FieldRolloutBehavior)), Jvm: toJvm(toSection(d, FieldJVM)), + AnomalyDetection: toAnomalyDetection(toSection(d, FieldAnomalyDetection)), ExcludedContainers: toExcludedContainers(d), }, } @@ -1118,6 +1167,17 @@ func suppressMemoryEventApplyTypeDefaultValueDiff(oldValue, newValue string, d * return oldValue == newValue } +func suppressAnomalyDetectionDefaultValueDiff(oldValue, newValue string, d *schema.ResourceData) bool { + if isEmpty(newValue) { + cpuStallThreshold := d.Get(fmt.Sprintf("%s.0.%s.0.%s", FieldAnomalyDetection, FieldAnomalyDetectionCpuPressure, FieldCpuStallThresholdPercentage)) + minPressuredPodPct := d.Get(fmt.Sprintf("%s.0.%s.0.%s", FieldAnomalyDetection, FieldAnomalyDetectionCpuPressure, FieldMinPressuredPodPercentage)) + // Suppress diff if the API-returned values equal the defaults (meaning no explicit config is needed) + return cpuStallThreshold == defaultCPUStallThresholdPct && minPressuredPodPct == defaultCPUStallMinPressuredPodPct + } + + return oldValue == newValue +} + func isEmpty(value string) bool { return value == "" || value == "0" } @@ -1492,6 +1552,40 @@ func toRolloutBehaviorMap(s *sdk.WorkloadoptimizationV1RolloutBehaviorSettings) return []map[string]any{m} } +func toAnomalyDetection(m map[string]any) *sdk.WorkloadoptimizationV1AnomalyDetectionSettings { + if len(m) == 0 { + return nil + } + result := &sdk.WorkloadoptimizationV1AnomalyDetectionSettings{} + if cpuPressure := getFirstElem(m, FieldAnomalyDetectionCpuPressure); cpuPressure != nil { + result.CpuPressure = &sdk.WorkloadoptimizationV1CPUPressureSettings{ + // schema already handles type validation, so casting is safe + CpuStallThresholdPercentage: cpuPressure[FieldCpuStallThresholdPercentage].(float64), + MinPressuredPodPercentage: cpuPressure[FieldMinPressuredPodPercentage].(float64), + } + } + return result +} + +func toAnomalyDetectionMap(s *sdk.WorkloadoptimizationV1AnomalyDetectionSettings) []map[string]any { + if s == nil { + return nil + } + m := map[string]any{} + if s.CpuPressure != nil { + m[FieldAnomalyDetectionCpuPressure] = []map[string]any{ + { + FieldCpuStallThresholdPercentage: s.CpuPressure.CpuStallThresholdPercentage, + FieldMinPressuredPodPercentage: s.CpuPressure.MinPressuredPodPercentage, + }, + } + } + if len(m) == 0 { + return nil + } + return []map[string]any{m} +} + func toJvm(m map[string]any) *sdk.WorkloadoptimizationV1JVMSettings { if len(m) == 0 { return nil diff --git a/castai/resource_workload_scaling_policy_test.go b/castai/resource_workload_scaling_policy_test.go index d444cf0d..6d260a72 100644 --- a/castai/resource_workload_scaling_policy_test.go +++ b/castai/resource_workload_scaling_policy_test.go @@ -124,6 +124,8 @@ func TestAccGKE_ResourceWorkloadScalingPolicy(t *testing.T) { // Requires workload-autoscaler from v0.35.3 resource.TestCheckResourceAttr(resourceName, "rollout_behavior.0.type", "NO_DISRUPTION"), resource.TestCheckResourceAttr(resourceName, "jvm.0.memory.0.optimization", "true"), + resource.TestCheckResourceAttr(resourceName, "anomaly_detection.0.cpu_pressure.0.cpu_stall_threshold_percentage", "50"), + resource.TestCheckResourceAttr(resourceName, "anomaly_detection.0.cpu_pressure.0.min_pressured_pod_percentage", "30"), ), }, }, @@ -393,6 +395,12 @@ func scalingPolicyConfigUpdated(clusterName, projectID, name string) string { confidence { threshold = 0.6 } + anomaly_detection { + cpu_pressure { + cpu_stall_threshold_percentage = 50 + min_pressured_pod_percentage = 30 + } + } jvm { memory { optimization = true @@ -837,6 +845,82 @@ func Test_toRolloutBehaviorMap(t *testing.T) { } } +func Test_toAnomalyDetection(t *testing.T) { + tests := map[string]struct { + args map[string]any + exp *sdk.WorkloadoptimizationV1AnomalyDetectionSettings + }{ + "should return nil on empty map": { + args: map[string]any{}, + exp: nil, + }, + "should return anomaly detection settings with cpu_pressure": { + args: map[string]any{ + FieldAnomalyDetectionCpuPressure: []any{ + map[string]any{ + FieldCpuStallThresholdPercentage: float64(50), + FieldMinPressuredPodPercentage: float64(30), + }, + }, + }, + exp: &sdk.WorkloadoptimizationV1AnomalyDetectionSettings{ + CpuPressure: &sdk.WorkloadoptimizationV1CPUPressureSettings{ + CpuStallThresholdPercentage: 50, + MinPressuredPodPercentage: 30, + }, + }, + }, + } + for name, tt := range tests { + t.Run(name, func(t *testing.T) { + r := require.New(t) + got := toAnomalyDetection(tt.args) + r.Equal(tt.exp, got) + }) + } +} + +func Test_toAnomalyDetectionMap(t *testing.T) { + tests := map[string]struct { + args *sdk.WorkloadoptimizationV1AnomalyDetectionSettings + exp []map[string]any + }{ + "should return nil for nil input": { + args: nil, + exp: nil, + }, + "should return anomaly detection map with cpu_pressure": { + args: &sdk.WorkloadoptimizationV1AnomalyDetectionSettings{ + CpuPressure: &sdk.WorkloadoptimizationV1CPUPressureSettings{ + CpuStallThresholdPercentage: 50, + MinPressuredPodPercentage: 30, + }, + }, + exp: []map[string]any{ + { + FieldAnomalyDetectionCpuPressure: []map[string]any{ + { + FieldCpuStallThresholdPercentage: float64(50), + FieldMinPressuredPodPercentage: float64(30), + }, + }, + }, + }, + }, + "should return nil for empty settings": { + args: &sdk.WorkloadoptimizationV1AnomalyDetectionSettings{}, + exp: nil, + }, + } + for name, tt := range tests { + t.Run(name, func(t *testing.T) { + r := require.New(t) + got := toAnomalyDetectionMap(tt.args) + r.Equal(tt.exp, got) + }) + } +} + func Test_toJvm(t *testing.T) { tests := map[string]struct { args map[string]any diff --git a/docs/resources/workload_scaling_policy.md b/docs/resources/workload_scaling_policy.md index 2a3371cc..afa572af 100644 --- a/docs/resources/workload_scaling_policy.md +++ b/docs/resources/workload_scaling_policy.md @@ -88,6 +88,12 @@ resource "castai_workload_scaling_policy" "services" { rollout_behavior { type = "NO_DISRUPTION" } + anomaly_detection { + cpu_pressure { + cpu_stall_threshold_percentage = 50 + min_pressured_pod_percentage = 30 + } + } jvm { memory { optimization = true @@ -115,6 +121,7 @@ resource "castai_workload_scaling_policy" "services" { ### Optional +- `anomaly_detection` (Block List, Max: 1) Defines anomaly detection settings for the scaling policy. (see [below for nested schema](#nestedblock--anomaly_detection)) - `anti_affinity` (Block List, Max: 1) (see [below for nested schema](#nestedblock--anti_affinity)) - `assignment_rules` (Block List) Allows defining conditions for automatically assigning workloads to this scaling policy. (see [below for nested schema](#nestedblock--assignment_rules)) - `confidence` (Block List, Max: 1) Defines the confidence settings for applying recommendations. (see [below for nested schema](#nestedblock--confidence)) @@ -248,6 +255,23 @@ Optional: + +### Nested Schema for `anomaly_detection` + +Optional: + +- `cpu_pressure` (Block List, Max: 1) Configures CPU pressure anomaly detection thresholds. (see [below for nested schema](#nestedblock--anomaly_detection--cpu_pressure)) + + +### Nested Schema for `anomaly_detection.cpu_pressure` + +Required: + +- `cpu_stall_threshold_percentage` (Number) Percentage of time (0-100) that a pod must experience CPU pressure to be considered under pressure. +- `min_pressured_pod_percentage` (Number) Percentage (0-100) of pods that must be experiencing pressure for the detector to trigger. + + + ### Nested Schema for `anti_affinity` diff --git a/examples/resources/castai_workload_scaling_policy/resource.tf b/examples/resources/castai_workload_scaling_policy/resource.tf index bda0b5a4..c0c2c2aa 100644 --- a/examples/resources/castai_workload_scaling_policy/resource.tf +++ b/examples/resources/castai_workload_scaling_policy/resource.tf @@ -71,6 +71,12 @@ resource "castai_workload_scaling_policy" "services" { rollout_behavior { type = "NO_DISRUPTION" } + anomaly_detection { + cpu_pressure { + cpu_stall_threshold_percentage = 50 + min_pressured_pod_percentage = 30 + } + } jvm { memory { optimization = true