-
Notifications
You must be signed in to change notification settings - Fork 263
Expand file tree
/
Copy pathscenario_cse_perf_test.go
More file actions
353 lines (323 loc) · 17.3 KB
/
scenario_cse_perf_test.go
File metadata and controls
353 lines (323 loc) · 17.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
package e2e
import (
"context"
"testing"
"time"
"github.com/Azure/agentbaker/e2e/config"
"github.com/Azure/agentbaker/pkg/agent/datamodel"
"github.com/Azure/azure-sdk-for-go/sdk/azcore/to"
"github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v7"
)
// cachedCSEThresholds holds the CSE timing limits for the Ubuntu 22.04 golden image (cached)
// path, i.e. the expected normal performance when all binaries are pre-cached on the VHD.
// Exceeding any limit indicates a regression in CSE task ordering or apt lock contention.
//
// The limits come from production telemetry (Ubuntu 22.04, GuestAgentGenericLogs table,
// FA database on azcore cluster, ~35K samples per task over 30 minutes):
//   - Task-specific limits sit at roughly p95, which catches regressions while tolerating
//     normal infra variance.
//   - DefaultTaskThreshold is a 45s catch-all for tasks that have no specific limit.
//   - holdWALinuxAgent has a bimodal distribution (p50=0.49s, p99=58s) caused by apt lock
//     contention; its limit sits at p90 because the cached path should avoid that contention.
var cachedCSEThresholds = CSETimingThresholds{
	TotalCSEThreshold:    60 * time.Second,
	DefaultTaskThreshold: 45 * time.Second, // generous 45s catch-all for untracked tasks
	TaskThresholds: map[string]time.Duration{
		// Core kubelet/containerd install
		"installDebPackageFromFile":   22 * time.Second, // prod p50=3.88s p95=21.55s p99=42.88s
		"holdWALinuxAgent":            24 * time.Second, // prod p50=0.49s p90=23.32s p95=37.47s (bimodal: apt lock)
		"configureKubeletAndKubectl":  27 * time.Second, // prod p50=6.56s p95=26.06s p99=44.39s
		"ensureContainerd":            3 * time.Second,  // prod p50=0.94s p95=1.99s p99=2.80s
		"ensureKubelet":               10 * time.Second, // prod p50=3.27s p95=6.20s p99=10.01s
		"installContainerRuntime":     2 * time.Second,  // prod p50=0.26s p95=0.50s p99=0.85s
		"installStandaloneContainerd": 5 * time.Second,  // prod p50=0.10s p95=0.18s p99=0.46s; bumped for E2E variance
		// Kubelet install variants (only one fires per VM depending on install path)
		"installKubeletKubectlFromPkg": 38 * time.Second, // prod p50=14.68s p95=37.45s p99=56.59s (PMC deb path)
		"installKubeletKubectlFromURL": 10 * time.Second, // prod p50=5.43s p95=9.59s p99=15.65s (URL path)
		"extractKubeBinaries":          10 * time.Second, // prod p50=5.97s p95=9.72s p99=15.21s
		// Credential provider
		"installCredentialProviderFromUrl": 2 * time.Second, // prod p50=1.01s p95=1.83s p99=2.89s
		"installCredentialProviderFromPkg": 5 * time.Second, // prod p50=1.95s p95=4.72s p99=6.34s
		"downloadCredentialProvider":       2 * time.Second, // prod p50=0.63s p95=1.27s p99=2.12s
		"installCredentialProvider":        3 * time.Second, // prod p50=0.94s p95=2.68s p99=6.02s
		// Networking and node configuration
		"retrycmd_nslookup":      4 * time.Second,  // prod p50=0.55s p95=3.89s p99=5.60s
		"configureNodeExporter":  44 * time.Second, // prod p50=1.62s p95=43.9s p99=117.45s (high tail!)
		"ubuntuSnapshotUpdate":   2 * time.Second,  // prod p50=0.59s p95=1.15s p99=1.55s
	},
}
// fullInstallCSEThresholds holds the CSE timing limits for the Ubuntu 22.04 full install path.
// They are more generous than the cached limits because the full path also downloads and
// installs packages.
//
// The limits come from the same production telemetry as the cached ones (Ubuntu 22.04) and
// sit at roughly p99, since the full install path is rarer and more variable.
//
// NOTE(review): installContainerRuntime quotes p99=0.78s here while cachedCSEThresholds quotes
// p99=0.85s for the same task — confirm which figure is current.
var fullInstallCSEThresholds = CSETimingThresholds{
	TotalCSEThreshold:    180 * time.Second,
	DefaultTaskThreshold: 60 * time.Second, // generous catch-all for untracked tasks
	TaskThresholds: map[string]time.Duration{
		// Core kubelet/containerd install
		"installDeps":                 90 * time.Second, // no direct prod data; generous for full install
		"installContainerRuntime":     60 * time.Second, // prod p50=0.26s p99=0.78s (cached); much higher on full
		"installDebPackageFromFile":   45 * time.Second, // prod p99=42.88s
		"holdWALinuxAgent":            60 * time.Second, // prod p99=58.07s (bimodal: apt lock contention)
		"configureKubeletAndKubectl":  45 * time.Second, // prod p99=44.39s
		"ensureContainerd":            5 * time.Second,  // prod p99=2.80s; slightly higher for full install
		"ensureKubelet":               15 * time.Second, // prod p99=10.01s; slightly higher for full install
		"installStandaloneContainerd": 5 * time.Second,  // prod p99=0.46s; bumped for E2E variance
		// Kubelet install variants
		"installKubeletKubectlFromPkg": 57 * time.Second, // prod p99=56.59s
		"installKubeletKubectlFromURL": 16 * time.Second, // prod p99=15.65s
		"extractKubeBinaries":          16 * time.Second, // prod p50=5.97s p95=9.72s p99=15.21s
		// Credential provider
		"installCredentialProviderFromUrl": 3 * time.Second, // prod p99=2.89s
		"installCredentialProviderFromPkg": 7 * time.Second, // prod p99=6.34s
		"downloadCredentialProvider":       3 * time.Second, // prod p99=2.12s
		"installCredentialProvider":        7 * time.Second, // prod p99=6.02s
		// Networking and node configuration
		"retrycmd_nslookup":      6 * time.Second,   // prod p99=5.60s
		"configureNodeExporter":  120 * time.Second, // prod p99=117.45s (extreme tail!)
		"ubuntuSnapshotUpdate":   2 * time.Second,   // prod p99=1.55s
		"downloadPkgFromVersion": 4 * time.Second,   // prod p50=0.30s p95=1.04s p99=3.39s
	},
}
// cachedCSEThresholdsUbuntu2404 holds the CSE timing limits for the Ubuntu 24.04 cached path.
// The limits come from production telemetry (GuestAgentGenericLogs, FA/azcore, ~500 samples
// per task over 10 minutes). Ubuntu 24.04 runs similar CSE tasks to 22.04, with slightly
// different latency profiles.
//
// NOTE(review): the configureNodeExporter limit (44s) equals the Ubuntu 22.04 cached value and
// is well above the 24.04 p95 of 11.48s quoted below — confirm this is intentional.
var cachedCSEThresholdsUbuntu2404 = CSETimingThresholds{
	TotalCSEThreshold:    60 * time.Second,
	DefaultTaskThreshold: 45 * time.Second,
	TaskThresholds: map[string]time.Duration{
		// Core kubelet/containerd install
		"installDebPackageFromFile":   24 * time.Second, // prod p50=4.92s p95=23.74s p99=33.47s
		"holdWALinuxAgent":            11 * time.Second, // prod p50=4.45s p95=10.97s p99=15.09s (less bimodal than 22.04)
		"configureKubeletAndKubectl":  38 * time.Second, // prod p50=21.65s p95=37.28s p99=45.94s
		"ensureContainerd":            2 * time.Second,  // prod p50=0.76s p95=1.34s p99=1.84s
		"ensureKubelet":               8 * time.Second,  // prod p50=4.32s p95=7.47s p99=10.50s
		"installContainerRuntime":     2 * time.Second,  // same as 22.04
		"installStandaloneContainerd": 5 * time.Second,  // same as 22.04; bumped for E2E variance
		// Kubelet install variants
		"installKubeletKubectlFromPkg": 37 * time.Second, // prod p50=21.39s p95=36.16s p99=44.51s
		"installKubeletKubectlFromURL": 7 * time.Second,  // prod p50=1.16s p95=6.42s (small sample)
		"extractKubeBinaries":          7 * time.Second,  // prod p50=6.28s (small sample)
		// Credential provider
		"installCredentialProviderFromUrl": 2 * time.Second, // prod p50=0.74s p95=1.49s
		"installCredentialProviderFromPkg": 7 * time.Second, // prod p50=3.01s p95=6.21s p99=8.57s
		"downloadCredentialProvider":       2 * time.Second, // prod p50=0.41s p95=1.22s
		// Networking and node configuration
		"configureNodeExporter": 44 * time.Second, // prod p50=1.37s p95=11.48s p99=60.68s
		"ubuntuSnapshotUpdate":  2 * time.Second,  // same as 22.04
	},
}
// fullInstallCSEThresholdsUbuntu2404 holds the CSE timing limits for the Ubuntu 24.04 full
// install path. Entries that carry a trailing comment are annotated with the production p99
// they were rounded up from; the remaining entries carry no telemetry annotation.
var fullInstallCSEThresholdsUbuntu2404 = CSETimingThresholds{
	TotalCSEThreshold:    180 * time.Second,
	DefaultTaskThreshold: 60 * time.Second,
	TaskThresholds: map[string]time.Duration{
		"installDeps":                      90 * time.Second,
		"installContainerRuntime":          60 * time.Second,
		"installDebPackageFromFile":        34 * time.Second, // prod p99=33.47s
		"holdWALinuxAgent":                 16 * time.Second, // prod p99=15.09s (better than 22.04)
		"configureKubeletAndKubectl":       46 * time.Second, // prod p99=45.94s
		"ensureContainerd":                 3 * time.Second,  // prod p99=1.84s
		"ensureKubelet":                    11 * time.Second, // prod p99=10.50s
		"installStandaloneContainerd":      5 * time.Second,
		"installKubeletKubectlFromPkg":     45 * time.Second, // prod p99=44.51s
		"installKubeletKubectlFromURL":     16 * time.Second,
		"extractKubeBinaries":              16 * time.Second,
		"installCredentialProviderFromUrl": 3 * time.Second,
		"installCredentialProviderFromPkg": 9 * time.Second, // prod p99=8.57s
		"downloadCredentialProvider":       3 * time.Second,
		"configureNodeExporter":            61 * time.Second, // prod p99=60.68s
		"ubuntuSnapshotUpdate":             2 * time.Second,
	},
}
// cachedCSEThresholdsAzureLinuxV3 holds the CSE timing limits for the Azure Linux V3 cached
// path. The limits come from production telemetry (GuestAgentGenericLogs, FA/azcore, ~1K
// samples per task over 10 minutes).
//
// Azure Linux uses RPM packages rather than apt/deb, so there are no holdWALinuxAgent or
// installDebPackageFromFile tasks. Only tasks actually emitted by the Azure Linux V3 CSE are
// listed; Ubuntu-specific tasks are left out.
//
// NOTE(review): the ensureNoDupOnPromiscuBridge limit (14s) sits at the quoted p99 (13.30s)
// rather than the p95 (7.59s) — confirm this is intentional.
var cachedCSEThresholdsAzureLinuxV3 = CSETimingThresholds{
	TotalCSEThreshold:    60 * time.Second,
	DefaultTaskThreshold: 45 * time.Second,
	TaskThresholds: map[string]time.Duration{
		// Core kubelet/containerd install (RPM-based, no apt lock contention)
		"configureKubeletAndKubectl":   34 * time.Second, // prod p50=4.56s p95=33.57s p99=47.93s
		"ensureContainerd":             2 * time.Second,  // prod p50=0.81s p95=1.22s p99=1.59s
		"ensureKubelet":                5 * time.Second,  // prod p50=2.47s p95=4.85s p99=9.31s
		"installKubeletKubectlFromPkg": 52 * time.Second, // prod p50=29.03s p95=51.86s p99=65.20s
		// Networking and node configuration
		"configureNodeExporter":       10 * time.Second, // prod p50=1.60s p95=9.84s p99=42.35s
		"ensureNoDupOnPromiscuBridge": 14 * time.Second, // prod p50=0.70s p95=7.59s p99=13.30s
	},
}
// fullInstallCSEThresholdsAzureLinuxV3 holds the CSE timing limits for the Azure Linux V3 full
// install path. Only tasks actually emitted by the Azure Linux V3 CSE are listed.
var fullInstallCSEThresholdsAzureLinuxV3 = CSETimingThresholds{
	TotalCSEThreshold:    180 * time.Second,
	DefaultTaskThreshold: 60 * time.Second,
	TaskThresholds: map[string]time.Duration{
		"installContainerRuntime":      60 * time.Second,
		"configureKubeletAndKubectl":   48 * time.Second, // prod p99=47.93s
		"ensureContainerd":             3 * time.Second,  // prod p99=1.59s
		"ensureKubelet":                10 * time.Second, // prod p99=9.31s
		"installKubeletKubectlFromPkg": 66 * time.Second, // prod p99=65.20s
		"configureNodeExporter":        43 * time.Second, // prod p99=42.35s
		"ensureNoDupOnPromiscuBridge":  14 * time.Second, // prod p99=13.30s
		"enableLocalDNS":               24 * time.Second, // prod p50=0s p95=12.85s p99=23.16s
	},
}
func Test_Ubuntu2204_CSE_CachedPerformance(t *testing.T) {
RunScenario(t, &Scenario{
Description: "Validates CSE timing on the golden image (cached) path where binaries are pre-installed on VHD. " +
"Forces the PMC deb package install path (installKubeletKubectlFromPkg → installDebPackageFromFile) " +
"by clearing CustomKubeBinaryURL and setting ShouldEnforceKubePMCInstall with k8s 1.34. " +
"This catches regressions like apt lock contention when task ordering changes.",
Config: Config{
Cluster: ClusterKubenet,
VHD: config.VHDUbuntu2204Gen2Containerd,
SkipScriptlessNBC: true,
EagerCSETimingExtraction: true,
BootstrapConfigMutator: func(_ *Cluster, nbc *datamodel.NodeBootstrappingConfiguration) {
// Disable scriptless CSE so traditional CSE scripts run and emit timing events
nbc.EnableScriptlessCSECmd = false
// The default 1.30 only has tarballs, not .deb files, so it would never
// exercise the installDebPackageFromFile code path.
nbc.ContainerService.Properties.OrchestratorProfile.OrchestratorVersion = "1.34.4"
nbc.AgentPoolProfile.KubernetesConfig.CustomKubeProxyImage = "mcr.microsoft.com/oss/kubernetes/kube-proxy:v1.34.4"
// Clear CustomKubeBinaryURL to prevent the URL-based install path.
// In production, many nodes use the PMC deb package path, not the URL path.
nbc.AgentPoolProfile.KubernetesConfig.CustomKubeBinaryURL = ""
},
VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) {
if vmss.Tags == nil {
vmss.Tags = map[string]*string{}
}
// Force the PMC deb package install path even on the E2E cluster.
// Without this, the CSE would fall back to the URL path which doesn't exercise
// installDebPackageFromFile (the function that caused the regression).
vmss.Tags["ShouldEnforceKubePMCInstall"] = to.Ptr("true")
},
Validator: func(ctx context.Context, s *Scenario) {
ValidateCSETimings(ctx, s, cachedCSEThresholds)
},
},
})
}
func Test_Ubuntu2204_CSE_FullInstallPerformance(t *testing.T) {
RunScenario(t, &Scenario{
Description: "Validates CSE timing on the full install path where all dependencies are installed from scratch. " +
"Uses SkipBinaryCleanup VMSS tag to force FULL_INSTALL_REQUIRED=true.",
Config: Config{
Cluster: ClusterKubenet,
VHD: config.VHDUbuntu2204Gen2Containerd,
SkipScriptlessNBC: true,
EagerCSETimingExtraction: true,
BootstrapConfigMutator: func(_ *Cluster, nbc *datamodel.NodeBootstrappingConfiguration) {
nbc.EnableScriptlessCSECmd = false
},
VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) {
if vmss.Tags == nil {
vmss.Tags = map[string]*string{}
}
vmss.Tags["SkipBinaryCleanup"] = to.Ptr("true")
},
Validator: func(ctx context.Context, s *Scenario) {
ValidateCSETimings(ctx, s, fullInstallCSEThresholds)
},
},
})
}
// --- Ubuntu 24.04 CSE Performance Tests ---
func Test_Ubuntu2404_CSE_CachedPerformance(t *testing.T) {
RunScenario(t, &Scenario{
Description: "Validates CSE timing on the golden image (cached) path for Ubuntu 24.04. " +
"Forces the PMC deb package install path by clearing CustomKubeBinaryURL and setting ShouldEnforceKubePMCInstall.",
Config: Config{
Cluster: ClusterKubenet,
VHD: config.VHDUbuntu2404Gen2Containerd,
SkipScriptlessNBC: true,
EagerCSETimingExtraction: true,
BootstrapConfigMutator: func(_ *Cluster, nbc *datamodel.NodeBootstrappingConfiguration) {
// Disable scriptless CSE so traditional CSE scripts run and emit timing events
nbc.EnableScriptlessCSECmd = false
nbc.ContainerService.Properties.OrchestratorProfile.OrchestratorVersion = "1.34.4"
nbc.AgentPoolProfile.KubernetesConfig.CustomKubeProxyImage = "mcr.microsoft.com/oss/kubernetes/kube-proxy:v1.34.4"
nbc.AgentPoolProfile.KubernetesConfig.CustomKubeBinaryURL = ""
},
VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) {
if vmss.Tags == nil {
vmss.Tags = map[string]*string{}
}
vmss.Tags["ShouldEnforceKubePMCInstall"] = to.Ptr("true")
},
Validator: func(ctx context.Context, s *Scenario) {
ValidateCSETimings(ctx, s, cachedCSEThresholdsUbuntu2404)
},
},
})
}
func Test_Ubuntu2404_CSE_FullInstallPerformance(t *testing.T) {
RunScenario(t, &Scenario{
Description: "Validates CSE timing on the full install path for Ubuntu 24.04. " +
"Uses SkipBinaryCleanup VMSS tag to force FULL_INSTALL_REQUIRED=true.",
Config: Config{
Cluster: ClusterKubenet,
VHD: config.VHDUbuntu2404Gen2Containerd,
SkipScriptlessNBC: true,
EagerCSETimingExtraction: true,
BootstrapConfigMutator: func(_ *Cluster, nbc *datamodel.NodeBootstrappingConfiguration) {
nbc.EnableScriptlessCSECmd = false
},
VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) {
if vmss.Tags == nil {
vmss.Tags = map[string]*string{}
}
vmss.Tags["SkipBinaryCleanup"] = to.Ptr("true")
},
Validator: func(ctx context.Context, s *Scenario) {
ValidateCSETimings(ctx, s, fullInstallCSEThresholdsUbuntu2404)
},
},
})
}
// --- Azure Linux V3 CSE Performance Tests ---
func Test_AzureLinuxV3_CSE_CachedPerformance(t *testing.T) {
RunScenario(t, &Scenario{
Description: "Validates CSE timing on the golden image (cached) path for Azure Linux V3. " +
"Azure Linux uses RPM packages — no apt lock contention, but different install paths.",
Config: Config{
Cluster: ClusterKubenet,
VHD: config.VHDAzureLinuxV3Gen2,
SkipScriptlessNBC: true,
EagerCSETimingExtraction: true,
BootstrapConfigMutator: func(_ *Cluster, nbc *datamodel.NodeBootstrappingConfiguration) {
nbc.EnableScriptlessCSECmd = false
},
Validator: func(ctx context.Context, s *Scenario) {
ValidateCSETimings(ctx, s, cachedCSEThresholdsAzureLinuxV3)
},
},
})
}
func Test_AzureLinuxV3_CSE_FullInstallPerformance(t *testing.T) {
RunScenario(t, &Scenario{
Description: "Validates CSE timing on the full install path for Azure Linux V3. " +
"Uses SkipBinaryCleanup VMSS tag to force FULL_INSTALL_REQUIRED=true.",
Config: Config{
Cluster: ClusterKubenet,
VHD: config.VHDAzureLinuxV3Gen2,
SkipScriptlessNBC: true,
EagerCSETimingExtraction: true,
BootstrapConfigMutator: func(_ *Cluster, nbc *datamodel.NodeBootstrappingConfiguration) {
nbc.EnableScriptlessCSECmd = false
},
VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) {
if vmss.Tags == nil {
vmss.Tags = map[string]*string{}
}
vmss.Tags["SkipBinaryCleanup"] = to.Ptr("true")
},
Validator: func(ctx context.Context, s *Scenario) {
ValidateCSETimings(ctx, s, fullInstallCSEThresholdsAzureLinuxV3)
},
},
})
}