Commit 5bd384e

feat(monitor): add ephemeral storage charge threshold monitoring (#6598)
Add support for monitoring and charging ephemeral storage usage beyond a configurable threshold. Pods with ephemeral storage exceeding the threshold (default 10Gi) will be charged for the excess amount.

Changes:
- Add EPHEMERAL_STORAGE_CHARGE_THRESHOLD environment variable (default: 10Gi)
- Track ephemeral storage usage per pod from container limits/requests
- Apply threshold deduction and charge only for excess usage
- Aggregate ephemeral storage charges into Storage resource metrics

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

# Conflicts:
#	controllers/resources/controllers/monitor_controller.go
1 parent 5febfce commit 5bd384e
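
To illustrate the charging rule the commit message describes, here is a minimal, self-contained Go sketch of the threshold arithmetic using the upstream `k8s.io/apimachinery/pkg/api/resource` package. `getEnvWithDefault` is a hypothetical stand-in for the repo's `env.GetEnvWithDefault` helper, and the 12Gi pod total is illustrative, not taken from the commit.

```go
package main

import (
	"fmt"
	"os"

	"k8s.io/apimachinery/pkg/api/resource"
)

// getEnvWithDefault is a stand-in for the repo's env.GetEnvWithDefault helper.
func getEnvWithDefault(key, def string) string {
	if v, ok := os.LookupEnv(key); ok {
		return v
	}
	return def
}

func main() {
	// Threshold defaults to 10Gi unless EPHEMERAL_STORAGE_CHARGE_THRESHOLD is set.
	threshold := resource.MustParse(getEnvWithDefault("EPHEMERAL_STORAGE_CHARGE_THRESHOLD", "10Gi"))

	podTotal := resource.MustParse("12Gi") // illustrative pod total, not from the commit
	if podTotal.Cmp(threshold) == 1 {      // charge only when strictly above the threshold
		podTotal.Sub(threshold) // keep only the excess
		fmt.Printf("charge for excess: %s\n", podTotal.String()) // -> 2Gi
	} else {
		fmt.Println("within threshold, no charge")
	}
}
```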

File tree

1 file changed: +15 -3

controllers/resources/controllers/monitor_controller.go

Lines changed: 15 additions & 3 deletions
@@ -111,9 +111,10 @@ type gpuNodeConfig struct {
 }
 
 const (
-	PrometheusURL         = "PROM_URL"
-	ObjectStorageInstance = "OBJECT_STORAGE_INSTANCE"
-	ConcurrentLimit       = "CONCURRENT_LIMIT"
+	PrometheusURL                      = "PROM_URL"
+	ObjectStorageInstance              = "OBJECT_STORAGE_INSTANCE"
+	ConcurrentLimit                    = "CONCURRENT_LIMIT"
+	envEphemeralStorageChargeThreshold = "EPHEMERAL_STORAGE_CHARGE_THRESHOLD"
 )
 
 const (
@@ -122,6 +123,7 @@ const (
 )
 
 var concurrentLimit = int64(DefaultConcurrencyLimit)
+var ephemeralStorageChargeThreshold = resource.MustParse(env.GetEnvWithDefault(envEphemeralStorageChargeThreshold, "10Gi"))
 
 const (
 	DefaultConcurrencyLimit = 1000
@@ -483,6 +485,7 @@ func (r *MonitorReconciler) monitorPodResourceUsage(namespace string, resUsed ma
 				r.Logger.Error(err, "get gpu config failed", "pod", pod.Name, "namespace", pod.Namespace, "node", pod.Spec.NodeName)
 			}
 		}
+		podEphemeralStorage := resource.NewQuantity(0, resource.BinarySI)
 		// skip pods that do not start for more than 1 minute
 		skip := pod.Status.Phase != corev1.PodRunning && (pod.Status.StartTime == nil || time.Since(pod.Status.StartTime.Time) > 1*time.Minute)
 		for _, container := range pod.Spec.Containers {
@@ -507,6 +510,15 @@ func (r *MonitorReconciler) monitorPodResourceUsage(namespace string, resUsed ma
 			} else {
 				resUsed[podResNamed.String()][corev1.ResourceMemory].Add(container.Resources.Requests[corev1.ResourceMemory])
 			}
+			if ephemeralRequest, ok := container.Resources.Limits[corev1.ResourceEphemeralStorage]; ok {
+				podEphemeralStorage.Add(ephemeralRequest)
+			} else {
+				podEphemeralStorage.Add(container.Resources.Requests[corev1.ResourceEphemeralStorage])
+			}
+		}
+		if !skip && podEphemeralStorage.Cmp(ephemeralStorageChargeThreshold) == 1 {
+			podEphemeralStorage.Sub(ephemeralStorageChargeThreshold)
+			resUsed[podResNamed.String()][corev1.ResourceStorage].Add(*podEphemeralStorage)
 		}
 	}
 	return nil
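
For reference, here is a runnable sketch of the per-container aggregation rule the last hunk implements: prefer the container's ephemeral-storage limit and fall back to its request. Indexing a `ResourceList` with an absent key yields a zero `Quantity`, so containers that declare neither contribute nothing. The container specs below are illustrative, and the sketch assumes the `k8s.io/api` and `k8s.io/apimachinery` modules are available; the resulting total is what would then feed the threshold deduction shown above.

```go
package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
)

// podEphemeralStorage mirrors the diff's aggregation: use the limit when set,
// otherwise fall back to the request (zero Quantity when absent).
func podEphemeralStorage(containers []corev1.Container) *resource.Quantity {
	total := resource.NewQuantity(0, resource.BinarySI)
	for _, c := range containers {
		if limit, ok := c.Resources.Limits[corev1.ResourceEphemeralStorage]; ok {
			total.Add(limit)
		} else {
			total.Add(c.Resources.Requests[corev1.ResourceEphemeralStorage])
		}
	}
	return total
}

func main() {
	// Illustrative pod: one container with a limit, one with only a request.
	containers := []corev1.Container{
		{Resources: corev1.ResourceRequirements{
			Limits: corev1.ResourceList{corev1.ResourceEphemeralStorage: resource.MustParse("8Gi")},
		}},
		{Resources: corev1.ResourceRequirements{
			Requests: corev1.ResourceList{corev1.ResourceEphemeralStorage: resource.MustParse("4Gi")},
		}},
	}
	total := podEphemeralStorage(containers)
	fmt.Printf("pod ephemeral storage: %s\n", total.String()) // -> 12Gi, i.e. 2Gi over a 10Gi threshold
}
```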
