-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathai_service_metrics_check.go
More file actions
116 lines (102 loc) · 4.24 KB
/
ai_service_metrics_check.go
File metadata and controls
116 lines (102 loc) · 4.24 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package conformance
import (
"encoding/json"
"fmt"
"github.com/NVIDIA/aicr/pkg/errors"
"github.com/NVIDIA/aicr/pkg/validator/checks"
)
// prometheusBaseURL is the Prometheus base URL that CheckAIServiceMetrics
// passes to checkAIServiceMetricsWithURL; the query path is appended to it.
// NOTE(review): the host looks like a Kubernetes Service DNS name
// (<service>.<namespace>.svc), so it is presumably only resolvable from
// inside the cluster — confirm it matches the deployed Prometheus service.
const prometheusBaseURL = "http://kube-prometheus-prometheus.monitoring.svc:9090"
// init builds the descriptor for the "ai-service-metrics" conformance check
// and hands it to checks.RegisterCheck when the package is loaded.
func init() {
	aiServiceMetrics := checks.Check{
		// Identity of the check and the requirement it covers.
		Name:          "ai-service-metrics",
		Description:   "Verify GPU metrics flow through Prometheus and custom metrics API is available",
		Phase:         phaseConformance,
		RequirementID: "accelerator_metrics",

		// Entry point and the name of the associated test.
		Func:     CheckAIServiceMetrics,
		TestName: "TestAIServiceMetrics",

		// Evidence metadata attached to the check.
		EvidenceTitle:         "Accelerator & AI Service Metrics",
		EvidenceDescription:   "Demonstrates that GPU metrics flow through Prometheus and are available via the Kubernetes custom metrics API for HPA scaling.",
		EvidenceFile:          "accelerator-metrics.md",
		SubmissionRequirement: true,
	}
	checks.RegisterCheck(&aiServiceMetrics)
}
// CheckAIServiceMetrics is the registered entry point for CNCF requirement #5
// (AI Service Metrics). It delegates to checkAIServiceMetricsWithURL using the
// package-level prometheusBaseURL, which checks that GPU metric time series
// are present in Prometheus and that the custom metrics API is available.
//
// It returns nil when both checks pass, otherwise the error produced by the
// underlying implementation.
func CheckAIServiceMetrics(ctx *checks.ValidationContext) error {
	// The URL is fixed here; tests call the WithURL variant directly.
	promURL := prometheusBaseURL
	return checkAIServiceMetricsWithURL(ctx, promURL)
}
// checkAIServiceMetricsWithURL is the testable implementation of
// CheckAIServiceMetrics; it accepts the Prometheus base URL as a parameter.
//
// It performs two checks in order and stops at the first failure:
//  1. Queries promBaseURL + "/api/v1/query" for DCGM_FI_DEV_GPU_UTIL and
//     requires at least one entry in the response's data.result array.
//  2. Requests /apis/custom.metrics.k8s.io/v1beta1 through the discovery REST
//     client of ctx.Clientset and requires the request to succeed.
//
// An artifact is recorded via recordArtifact for each check that gets far
// enough to produce data (including the failing custom metrics request).
//
// Returned errors:
//   - ErrCodeInvalidRequest: ctx.Clientset is nil.
//   - ErrCodeUnavailable: the Prometheus request failed, or the response
//     envelope carries a status other than "success".
//   - ErrCodeInternal: the Prometheus response is not valid JSON, or the
//     discovery REST client is nil.
//   - ErrCodeNotFound: Prometheus returned no time series, or the custom
//     metrics API request failed.
func checkAIServiceMetricsWithURL(ctx *checks.ValidationContext, promBaseURL string) error {
	if ctx.Clientset == nil {
		return errors.New(errors.ErrCodeInvalidRequest, "kubernetes client is not available")
	}

	// 1. Query Prometheus for GPU metric time series.
	queryURL := fmt.Sprintf("%s/api/v1/query?query=DCGM_FI_DEV_GPU_UTIL", promBaseURL)
	body, err := httpGet(ctx.Context, queryURL)
	if err != nil {
		return errors.Wrap(errors.ErrCodeUnavailable, "Prometheus unreachable", err)
	}

	// Decode only the envelope fields this check needs. The individual series
	// stay as raw JSON because only their count matters.
	var promResp struct {
		Status    string `json:"status"`
		ErrorType string `json:"errorType"`
		Error     string `json:"error"`
		Data      struct {
			Result []json.RawMessage `json:"result"`
		} `json:"data"`
	}
	if err := json.Unmarshal(body, &promResp); err != nil {
		return errors.Wrap(errors.ErrCodeInternal, "failed to parse Prometheus response", err)
	}

	recordArtifact(ctx, "Prometheus Query: DCGM_FI_DEV_GPU_UTIL",
		fmt.Sprintf("Endpoint: %s\nTime series count: %d", queryURL, len(promResp.Data.Result)))

	// An error envelope has no result entries, so without this check a failed
	// query would be misreported below as "no time series". A missing status
	// field is tolerated so minimal responses keep working as before.
	if promResp.Status != "" && promResp.Status != "success" {
		return errors.New(errors.ErrCodeUnavailable,
			fmt.Sprintf("Prometheus query for DCGM_FI_DEV_GPU_UTIL returned status %q (errorType=%q): %s",
				promResp.Status, promResp.ErrorType, promResp.Error))
	}

	if len(promResp.Data.Result) == 0 {
		return errors.New(errors.ErrCodeNotFound,
			"no DCGM_FI_DEV_GPU_UTIL time series in Prometheus")
	}

	// 2. Custom metrics API available.
	rawURL := "/apis/custom.metrics.k8s.io/v1beta1"
	restClient := ctx.Clientset.Discovery().RESTClient()
	if restClient == nil {
		return errors.New(errors.ErrCodeInternal, "discovery REST client is not available")
	}
	result := restClient.Get().AbsPath(rawURL).Do(ctx.Context)

	// Capture the HTTP status up front so it appears in the artifact on both
	// the failure and the success path.
	var statusCode int
	result.StatusCode(&statusCode)
	if cmErr := result.Error(); cmErr != nil {
		recordArtifact(ctx, "Custom Metrics API",
			fmt.Sprintf("Endpoint: %s\nHTTP Status: %d\nStatus: unavailable\nError: %v",
				rawURL, statusCode, cmErr))
		return errors.Wrap(errors.ErrCodeNotFound,
			"custom metrics API not available", cmErr)
	}

	// The discovery document is decoded on a best-effort basis: it only
	// enriches the artifact, so read or decode failures fall back to the
	// placeholder values instead of failing the check.
	groupVersion := "unknown"
	resourceCount := 0
	discoveryBody, rawErr := result.Raw()
	if rawErr == nil {
		var discovery struct {
			GroupVersion string            `json:"groupVersion"`
			Resources    []json.RawMessage `json:"resources"`
		}
		if json.Unmarshal(discoveryBody, &discovery) == nil {
			if discovery.GroupVersion != "" {
				groupVersion = discovery.GroupVersion
			}
			resourceCount = len(discovery.Resources)
		}
	}

	recordArtifact(ctx, "Custom Metrics API",
		fmt.Sprintf("Endpoint: %s\nHTTP Status: %d\nGroupVersion: %s\nAPI Resources: %d\nStatus: available",
			rawURL, statusCode, groupVersion, resourceCount))
	return nil
}