diff --git a/clusterloader2/pkg/dependency/dra/dra.go b/clusterloader2/pkg/dependency/dra/dra.go index cd82aa67fb..332d2381dc 100644 --- a/clusterloader2/pkg/dependency/dra/dra.go +++ b/clusterloader2/pkg/dependency/dra/dra.go @@ -167,12 +167,6 @@ func (d *draDependency) isDRADriverReady(config *dependency.Config, daemonsetNam } func isResourceSlicesPublished(config *dependency.Config, namespace string) (bool, error) { - // Get a list of all nodes - // nodes, err := getReadyNodesCount(config) - // if err != nil { - // return false, fmt.Errorf("failed to list nodes: %v", err) - // } - driverPluginPods, err := getDriverPluginPods(config, namespace, draDaemonsetName) if err != nil { return false, fmt.Errorf("failed to list driverPluginPods: %v", err) diff --git a/clusterloader2/testing/dra/README.md b/clusterloader2/testing/dra/README.md index e6c2ff93db..db48ac1857 100644 --- a/clusterloader2/testing/dra/README.md +++ b/clusterloader2/testing/dra/README.md @@ -14,8 +14,8 @@ export CL2_MODE=Indexed export CL2_NODES_PER_NAMESPACE=1 export CL2_LOAD_TEST_THROUGHPUT=20 # Fast initial fill export CL2_STEADY_STATE_QPS=5 # Controlled rate for measurement -export CL2_JOB_RUNNING_TIME=30s # Short-lived pods runtime -export CL2_LONG_JOB_RUNNING_TIME=1h # Long-running pods runtime (for cluster fill) +export CL2_SHORT_LIVED_JOB_RUNNING_TIME=30s # Short-lived pods runtime +export CL2_LONG_LIVED_JOB_RUNNING_TIME=1h # Long-running pods runtime (for cluster fill) export CL2_GPUS_PER_NODE=8 # GPUs per node export CL2_FILL_PERCENTAGE=90 # Cluster fill percentage ``` diff --git a/clusterloader2/testing/dra/config.yaml b/clusterloader2/testing/dra/config.yaml index 367fae86fd..41d1b0e3c8 100644 --- a/clusterloader2/testing/dra/config.yaml +++ b/clusterloader2/testing/dra/config.yaml @@ -2,28 +2,47 @@ {{$NODES_PER_NAMESPACE := MinInt .Nodes (DefaultParam .CL2_NODES_PER_NAMESPACE 100)}} {{$LOAD_TEST_THROUGHPUT := DefaultParam .CL2_LOAD_TEST_THROUGHPUT 10}} {{$STEADY_STATE_QPS := DefaultParam .CL2_STEADY_STATE_QPS 5}} +{{$RESOURCE_SLICES_PER_NODE := DefaultParam .CL2_RESOURCE_SLICES_PER_NODE 1}} +{{$LONG_LIVED_JOBS_STARTUP_PERC50_THRESHOLD := DefaultParam .CL2_LONG_LIVED_JOBS_STARTUP_PERC50_THRESHOLD "5s"}} +{{$LONG_LIVED_JOBS_STARTUP_PERC90_THRESHOLD := DefaultParam .CL2_LONG_LIVED_JOBS_STARTUP_PERC90_THRESHOLD "5s"}} +{{$LONG_LIVED_JOBS_STARTUP_PERC100_THRESHOLD := DefaultParam .CL2_LONG_LIVED_JOBS_STARTUP_PERC100_THRESHOLD "5s"}} +{{$CHURN_JOBS_WAIT_PERC50_THRESHOLD := DefaultParam .CL2_CHURN_JOBS_WAIT_PERC50_THRESHOLD "5s"}} +{{$CHURN_JOBS_WAIT_PERC90_THRESHOLD := DefaultParam .CL2_CHURN_JOBS_WAIT_PERC90_THRESHOLD "5s"}} +{{$CHURN_JOBS_WAIT_PERC99_THRESHOLD := DefaultParam .CL2_CHURN_JOBS_WAIT_PERC99_THRESHOLD "5s"}} +{{$SHORT_LIVED_JOBS_WAIT_THRESHOLD := DefaultParam .CL2_SHORT_LIVED_JOBS_WAIT_THRESHOLD "10m"}} +{{$LONG_LIVED_JOBS_WAIT_THRESHOLD := DefaultParam .CL2_LONG_LIVED_JOBS_WAIT_THRESHOLD "10m"}} +{{$RUNNING_JOBS_OPERATION_THRESHOLD := DefaultParam .CL2_RUNNING_JOBS_OPERATION_THRESHOLD "120s"}} {{$token := .CL2_TOKEN }} {{$namespaces := DivideInt .Nodes $NODES_PER_NAMESPACE}} +# dra +{{$draManifests := DefaultParam .CL2_DRA_MANIFESTS "dra-example-driver"}} +{{$draDaemonsetName := DefaultParam .CL2_DRA_DAEMONSET_NAME "dra-example-driver-kubeletplugin"}} + # Node resource configuration {{$gpusPerNode := DefaultParam .CL2_GPUS_PER_NODE 8}} +{{$resourceSlicesPerNode := DefaultParam .CL2_RESOURCE_SLICES_PER_NODE 1}} +{{$totalResourceSliceCount := MultiplyInt $resourceSlicesPerNode .Nodes}} {{$totalGPUs := MultiplyInt $gpusPerNode .Nodes}} # fast fill job configuration - for initial fill up {{$fillPercentage := DefaultParam .CL2_FILL_PERCENTAGE 90}} {{$fillPodsCount := DivideInt (MultiplyInt $totalGPUs $fillPercentage) 100}} {{$fillPodsPerNamespace := DivideInt $fillPodsCount $namespaces}} -{{$longJobSize := 1}} -{{$longJobRunningTime := DefaultParam .CL2_LONG_JOB_RUNNING_TIME "1h"}} +{{$longLivedJobSize := 1}} +{{$longLivedJobRunningTime := DefaultParam .CL2_LONG_LIVED_JOB_RUNNING_TIME "1h"}} # churn job configuration for steady state -{{$smallJobPodsCount := SubtractInt $totalGPUs (MultiplyInt $fillPodsPerNamespace $namespaces)}} -{{$smallJobsPerNamespace := DivideInt $smallJobPodsCount $namespaces}} -{{$smallJobSize := 1}} -{{$smallJobCompletions := 10}} -{{$jobRunningTime := DefaultParam .CL2_JOB_RUNNING_TIME "30s"}} +{{$shortLivedJobPodsCount := SubtractInt $totalGPUs (MultiplyInt $fillPodsPerNamespace $namespaces)}} +{{$calculatedSJPN := DivideInt $shortLivedJobPodsCount $namespaces}} +{{$maxSJPN := DefaultParam .CL2_MAX_SHORT_LIVED_JOBS_PER_NAMESPACE 999999}} +{{$shortLivedJobsPerNamespace := MinInt $calculatedSJPN $maxSJPN}} +{{$shortLivedJobSize := 1}} +{{$shortLivedJobCompletions := DefaultParam .CL2_SHORT_LIVED_JOB_COMPLETIONS 10}} +{{$shortLivedJobRunningTime := DefaultParam .CL2_SHORT_LIVED_JOB_RUNNING_TIME "30s"}} {{$ENABLE_EXTENDED_RESOURCES := DefaultParam .CL2_ENABLE_EXTENDED_RESOURCES false}} +{{$deviceClassName := DefaultParam .CL2_DEVICE_CLASS_NAME "gpu.example.com"}} {{$extendedResourceName := ""}} {{if $ENABLE_EXTENDED_RESOURCES}} @@ -49,7 +68,9 @@ dependencies: - name: Install dra-example-driver for test Method: DRATestDriver Params: - WorkerNodeCount: {{.Nodes}} + WorkerNodeCount: {{$totalResourceSliceCount}} + DaemonsetName: {{$draDaemonsetName}} + Manifests: {{$draManifests}} {{if $ENABLE_EXTENDED_RESOURCES}} ExtendedResourceName: {{$extendedResourceName}} {{end}} @@ -70,12 +91,15 @@ steps: apiVersion: batch/v1 kind: Job labelSelector: job-type = long-running - operationTimeout: 120s + operationTimeout: {{$RUNNING_JOBS_OPERATION_THRESHOLD}} - Identifier: FastFillPodStartupLatency Method: PodStartupLatency Params: action: start labelSelector: job-type = long-running + perc50Threshold: {{$LONG_LIVED_JOBS_STARTUP_PERC50_THRESHOLD}} + perc90Threshold: {{$LONG_LIVED_JOBS_STARTUP_PERC90_THRESHOLD}} + threshold: {{$LONG_LIVED_JOBS_STARTUP_PERC100_THRESHOLD}} - Identifier: FastFillClaimAllocationLatency Method: ResourceClaimAllocationLatency Params: @@ -111,6 +135,8 @@ steps: objectBundle: - basename: single-gpu objectTemplatePath: "resourceclaimtemplate.yaml" + templateFillMap: + DeviceClassName: {{$deviceClassName}} {{end}} - name: Fill cluster to {{$fillPercentage}}% utilization phases: @@ -123,9 +149,9 @@ steps: - basename: long-running objectTemplatePath: "long-running-job.yaml" templateFillMap: - Replicas: {{$longJobSize}} + Replicas: {{$longLivedJobSize}} Mode: {{$MODE}} - Sleep: {{$longJobRunningTime}} + Sleep: {{$longLivedJobRunningTime}} ExtendedResource: {{ $ENABLE_EXTENDED_RESOURCES }} - name: Wait for fill pods to be running measurements: @@ -134,7 +160,7 @@ steps: Params: action: gather labelSelector: job-type = long-running - timeout: 15m + timeout: {{$LONG_LIVED_JOBS_WAIT_THRESHOLD}} - name: Gather measurements for long running pods measurements: - Identifier: FastFillSchedulingMetrics @@ -145,6 +171,9 @@ steps: Method: PodStartupLatency Params: action: gather + perc50Threshold: {{$LONG_LIVED_JOBS_STARTUP_PERC50_THRESHOLD}} + perc90Threshold: {{$LONG_LIVED_JOBS_STARTUP_PERC90_THRESHOLD}} + threshold: {{$LONG_LIVED_JOBS_STARTUP_PERC100_THRESHOLD}} - Identifier: FastFillClaimAllocationLatency Method: ResourceClaimAllocationLatency Params: @@ -164,9 +193,9 @@ steps: Params: action: start labelSelector: job-type = short-lived - perc50Threshold: 40s - perc90Threshold: 60s - perc99Threshold: 80s + perc50Threshold: {{$CHURN_JOBS_WAIT_PERC50_THRESHOLD}} + perc90Threshold: {{$CHURN_JOBS_WAIT_PERC90_THRESHOLD}} + perc99Threshold: {{$CHURN_JOBS_WAIT_PERC99_THRESHOLD}} - Identifier: ChurnClaimAllocationLatency Method: ResourceClaimAllocationLatency Params: @@ -192,16 +221,16 @@ steps: - namespaceRange: min: 1 max: {{$namespaces}} - replicasPerNamespace: {{$smallJobsPerNamespace}} + replicasPerNamespace: {{$shortLivedJobsPerNamespace}} tuningSet: SteadyState objectBundle: - basename: small objectTemplatePath: "job.yaml" templateFillMap: - Replicas: {{$smallJobSize}} - CompletionReplicas: {{$smallJobCompletions}} + Replicas: {{$shortLivedJobSize}} + CompletionReplicas: {{$shortLivedJobCompletions}} Mode: {{$MODE}} - Sleep: {{$jobRunningTime}} + Sleep: {{$shortLivedJobRunningTime}} ExtendedResource: {{ $ENABLE_EXTENDED_RESOURCES }} - name: Wait for short-lived jobs to finish measurements: @@ -210,7 +239,7 @@ steps: Params: action: gather labelSelector: job-type = short-lived - timeout: 15m + timeout: {{$SHORT_LIVED_JOBS_WAIT_THRESHOLD}} - name: Measure scheduler metrics measurements: - Identifier: ChurnSchedulingMetrics @@ -221,9 +250,9 @@ steps: Method: PodStartupLatency Params: action: gather - perc50Threshold: 40s - perc90Threshold: 60s - perc99Threshold: 80s + perc50Threshold: {{$CHURN_JOBS_WAIT_PERC50_THRESHOLD}} + perc90Threshold: {{$CHURN_JOBS_WAIT_PERC90_THRESHOLD}} + perc99Threshold: {{$CHURN_JOBS_WAIT_PERC99_THRESHOLD}} - Identifier: ChurnClaimAllocationLatency Method: ResourceClaimAllocationLatency Params: @@ -231,4 +260,4 @@ steps: - Identifier: ChurnDRAMetrics Method: GenericPrometheusQuery Params: - action: gather + action: gather \ No newline at end of file diff --git a/clusterloader2/testing/dra/job.yaml b/clusterloader2/testing/dra/job.yaml index b9df270291..cbff1104a7 100644 --- a/clusterloader2/testing/dra/job.yaml +++ b/clusterloader2/testing/dra/job.yaml @@ -9,7 +9,9 @@ spec: parallelism: {{.Replicas}} completions: {{.CompletionReplicas}} completionMode: {{.Mode}} - ttlSecondsAfterFinished: 300 + # In tests involving a large number of sequentially created, short-lived jobs, the spin-up time may be significant. + # A TTL of 1 hour should be sufficient to retain the jobs long enough for measurement checks. + ttlSecondsAfterFinished: 3600 # 1 hour template: metadata: labels: diff --git a/clusterloader2/testing/dra/resourceclaimtemplate.yaml b/clusterloader2/testing/dra/resourceclaimtemplate.yaml index 2c64f505a4..8cea562a26 100644 --- a/clusterloader2/testing/dra/resourceclaimtemplate.yaml +++ b/clusterloader2/testing/dra/resourceclaimtemplate.yaml @@ -8,4 +8,4 @@ spec: requests: - name: gpu exactly: - deviceClassName: gpu.example.com \ No newline at end of file + deviceClassName: {{.DeviceClassName}} \ No newline at end of file