Skip to content

Commit 4b56a9a

Browse files
authored
Fix metrics and handle pods backed by custom controllers (#12)
1 parent b438290 commit 4b56a9a

File tree

7 files changed

+84
-12
lines changed

7 files changed

+84
-12
lines changed

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ require (
77
github.com/go-co-op/gocron v1.16.2
88
github.com/pkg/errors v0.9.1
99
github.com/prometheus/client_golang v1.13.0
10+
github.com/prometheus/common v0.37.0
1011
github.com/sirupsen/logrus v1.9.0
1112
github.com/spf13/cobra v1.5.0
1213
github.com/spf13/viper v1.12.0
@@ -53,7 +54,6 @@ require (
5354
github.com/pelletier/go-toml v1.9.5 // indirect
5455
github.com/pelletier/go-toml/v2 v2.0.1 // indirect
5556
github.com/prometheus/client_model v0.2.0 // indirect
56-
github.com/prometheus/common v0.37.0 // indirect
5757
github.com/prometheus/procfs v0.8.0 // indirect
5858
github.com/robfig/cron/v3 v3.0.1 // indirect
5959
github.com/spf13/afero v1.8.2 // indirect

go.sum

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,7 @@ github.com/inconshreveable/mousetrap v1.0.0 h1:Z8tu5sraLXCXIcARxBp/8cbvlwVa7Z1NH
191191
github.com/inconshreveable/mousetrap v1.0.0/go.mod h1:PxqpIevigyE2G7u3NXJIT2ANytuPF1OarO4DADm73n8=
192192
github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY=
193193
github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y=
194+
github.com/jpillora/backoff v1.0.0 h1:uvFg412JmmHBHw7iwprIxkPMI+sGQ4kzOWsMeHnm2EA=
194195
github.com/jpillora/backoff v1.0.0/go.mod h1:J/6gKK9jxlEcS3zixgDgUAsiuZ7yrSoa/FX5e0EB2j4=
195196
github.com/json-iterator/go v1.1.6/go.mod h1:+SdeFBvtyEkXs7REEP0seUULqWtbJapLOCVDaaPEHmU=
196197
github.com/json-iterator/go v1.1.10/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4=
@@ -237,6 +238,7 @@ github.com/monochromegane/go-gitignore v0.0.0-20200626010858-205db1a8cc00/go.mod
237238
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA=
238239
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
239240
github.com/mwitkow/go-conntrack v0.0.0-20161129095857-cc309e4a2223/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U=
241+
github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f h1:KUppIJq7/+SVif2QVs3tOP0zanoHgBEVAwHxUSIzRqU=
240242
github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U=
241243
github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e h1:fD57ERR4JtEqsWbfPhv4DMiApHyliiK5xCTNVSPiaAs=
242244
github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno=

internal/testing/e2e_test.go

Lines changed: 54 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,12 +12,19 @@ governing permissions and limitations under the License.
1212
package e2e
1313

1414
import (
15+
"context"
16+
"fmt"
1517
"github.com/adobe/k8s-shredder/pkg/config"
1618
"github.com/adobe/k8s-shredder/pkg/handler"
1719
"github.com/adobe/k8s-shredder/pkg/utils"
20+
"github.com/prometheus/client_golang/api"
21+
v1 "github.com/prometheus/client_golang/api/prometheus/v1"
22+
"github.com/prometheus/common/model"
1823
log "github.com/sirupsen/logrus"
1924
"golang.org/x/exp/slices"
2025
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
26+
"os"
27+
"strings"
2128
"testing"
2229
"time"
2330
)
@@ -91,6 +98,51 @@ func compareTime(expirationTime time.Time, t *testing.T, ch chan time.Time) {
9198
}
9299

93100
func TestShredderMetrics(t *testing.T) {
94-
// TODO add metrics validation tests
95-
t.Log("Metrics validation test passed!")
101+
102+
var warnings []string
103+
var results []string
104+
105+
// Intentionally skipped the gauge metrics as they are going to be wiped out before every eviction loop
106+
shredderMetrics := []string{
107+
"shredder_loops_total",
108+
"shredder_loops_duration_seconds",
109+
"shredder_processed_nodes_total",
110+
"shredder_processed_pods_total",
111+
"shredder_errors_total",
112+
}
113+
114+
for _, shredderMetric := range shredderMetrics {
115+
result, warning, err := prometheusQuery(shredderMetric)
116+
if err != nil {
117+
t.Errorf("Error querying Prometheus: %v\n", err)
118+
}
119+
warnings = append(warnings, warning...)
120+
results = append(results, result.String())
121+
}
122+
123+
if len(warnings) > 0 {
124+
t.Logf("Warnings: %v\n", strings.Join(warnings, "\n"))
125+
}
126+
127+
t.Logf("Results: \n%v\n", strings.Join(results, "\n"))
128+
129+
if len(results) == len(shredderMetrics) {
130+
t.Log("Metrics validation test passed!")
131+
}
132+
}
133+
134+
func prometheusQuery(query string) (model.Value, v1.Warnings, error) {
135+
136+
client, err := api.NewClient(api.Config{
137+
Address: "http://localhost:30007",
138+
})
139+
if err != nil {
140+
fmt.Printf("Error creating client: %v\n", err)
141+
os.Exit(1)
142+
}
143+
144+
v1api := v1.NewAPI(client)
145+
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
146+
defer cancel()
147+
return v1api.Query(ctx, query, time.Now(), v1.WithTimeout(5*time.Second))
96148
}

internal/testing/kind.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,9 @@ networking:
55
apiServerAddress: 0.0.0.0
66
nodes:
77
- role: control-plane
8+
extraPortMappings:
9+
- containerPort: 30007
10+
hostPort: 30007
811
kubeadmConfigPatches:
912
- |
1013
kind: InitConfiguration

internal/testing/prometheus_stuffs.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,11 +52,13 @@ metadata:
5252
name: prometheus
5353
namespace: kube-system
5454
spec:
55+
type: NodePort
5556
selector:
5657
app: prometheus
5758
ports:
5859
- port: 9090
5960
targetPort: 9090
61+
nodePort: 30007
6062
---
6163
apiVersion: v1
6264
kind: ConfigMap

pkg/handler/handler.go

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ import (
2626
log "github.com/sirupsen/logrus"
2727
appsv1 "k8s.io/api/apps/v1"
2828
v1 "k8s.io/api/core/v1"
29-
policy "k8s.io/api/policy/v1beta1"
29+
policy "k8s.io/api/policy/v1"
3030
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3131
"k8s.io/apimachinery/pkg/labels"
3232
"k8s.io/apimachinery/pkg/runtime"
@@ -69,9 +69,16 @@ func NewHandler(appContext *utils.AppContext) *Handler {
6969

7070
// Run starts an eviction loop
7171
func (h *Handler) Run() error {
72+
// start measuring the loop duration
7273
loopTimer := prometheus.NewTimer(prometheus.ObserverFunc(func(v float64) {
7374
metrics.ShredderLoopsDurationSeconds.Observe(v * 10e6)
7475
}))
76+
77+
// reset gauge metrics
78+
metrics.ShredderNodeForceToEvictTime.Reset()
79+
metrics.ShredderPodForceToEvictTime.Reset()
80+
metrics.ShredderPodErrorsTotal.Reset()
81+
7582
h.logger.Infof("Starting eviction loop")
7683

7784
// sync all nodes goroutines
@@ -208,7 +215,14 @@ func (h *Handler) processNode(node v1.Node, rr chan *controllerObject) error {
208215
h.logger.WithFields(log.Fields{
209216
"namespace": pod.Namespace,
210217
"pod": pod.Name,
211-
}).Warnf("Failed to get pod controller object: %s", err.Error())
218+
}).Warnf("Failed to get pod controller object: %s. Proceeding directly with pod eviction", err.Error())
219+
err := h.evictPod(pod, deleteOptions)
220+
if err != nil {
221+
h.logger.WithFields(log.Fields{
222+
"namespace": pod.Namespace,
223+
"pod": pod.Name,
224+
}).Warnf("Failed to evict pod: %s", err.Error())
225+
}
212226
continue
213227
}
214228

@@ -296,8 +310,7 @@ func (h *Handler) GetPodsForNode(node v1.Node) ([]v1.Pod, error) {
296310
// evictPod evict a pod using the eviction API
297311
func (h *Handler) evictPod(pod v1.Pod, deleteOptions *metav1.DeleteOptions) error {
298312
h.logger.Infof("Evicting pod %s from %s namespace", pod.Name, pod.Namespace)
299-
// TODO switch to stable V1 API version once we switch to k8s 1.22
300-
err := h.appContext.K8sClient.PolicyV1beta1().Evictions(pod.Namespace).Evict(h.appContext.Context, &policy.Eviction{
313+
err := h.appContext.K8sClient.PolicyV1().Evictions(pod.Namespace).Evict(h.appContext.Context, &policy.Eviction{
301314
ObjectMeta: metav1.ObjectMeta{
302315
Name: pod.Name,
303316
Namespace: pod.Namespace,
@@ -367,9 +380,9 @@ func (h *Handler) getControllerObject(pod v1.Pod) (*controllerObject, error) {
367380
return co, err
368381
}
369382
return newControllerObject("StatefulSet", sts.Name, sts.Namespace, sts), nil
383+
default:
384+
return co, errors.Errorf("Controller object of type %s is not a standard controller", pod.OwnerReferences[0].Kind)
370385
}
371-
372-
return co, errors.Errorf("could not find controller object for type %s\n", pod.OwnerReferences[0].Kind)
373386
}
374387

375388
func (h *Handler) isRolloutRestartInProgress(co *controllerObject) (bool, error) {

pkg/metrics/metrics.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -78,10 +78,10 @@ var (
7878
)
7979

8080
// ShredderPodErrorsTotal = Total pod errors
81-
ShredderPodErrorsTotal = prometheus.NewCounterVec(
82-
prometheus.CounterOpts{
81+
ShredderPodErrorsTotal = prometheus.NewGaugeVec(
82+
prometheus.GaugeOpts{
8383
Name: "shredder_pod_errors_total",
84-
Help: "Total pod errors",
84+
Help: "Total pod errors per eviction loop",
8585
},
8686
[]string{"pod_name", "namespace", "reason", "action"},
8787
)

0 commit comments

Comments
 (0)