Skip to content

Commit 3d4826a

Browse files
authored
Enhance agent startup logging, more generous startup restrictions (#196)
2.11.1 / CLDYCON-2913
1 parent 04bf7a8 commit 3d4826a

File tree

5 files changed

+154
-35
lines changed

5 files changed

+154
-35
lines changed

charts/metrics-agent/Chart.yaml

+2-2
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,8 @@ type: application
1414

1515
# This is the chart version. This version number should be incremented each time you make changes
1616
# to the chart and its templates, including the app version.
17-
version: 2.11.0
17+
version: 2.11.1
1818

1919
# This is the version number of the application being deployed. This version number should be
2020
# incremented each time you make changes to the application.
21-
appVersion: 2.11.0
21+
appVersion: 2.11.1

charts/metrics-agent/values.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ pollInterval: 180
1919

2020
image:
2121
name: cloudability/metrics-agent
22-
tag: 2.11.0
22+
tag: 2.11.1
2323
pullPolicy: Always
2424

2525
imagePullSecrets: []

kubernetes/nodecollection.go

+77-26
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99
"net/http"
1010
"os"
1111
"sync"
12+
"sync/atomic"
1213
"time"
1314

1415
"github.com/cloudability/metrics-agent/retrieval/raw"
@@ -29,7 +30,7 @@ func (err nodeError) Error() string {
2930
}
3031

3132
const (
32-
FatalNodeError = nodeError("unable to retrieve required set of node metrics via direct or proxy connection")
33+
FatalNodeError = nodeError("unable to retrieve required metrics from any node via direct or proxy connection")
3334
)
3435

3536
// NodeSource is an interface to get a list of Nodes
@@ -346,36 +347,87 @@ func ensureNodeSource(ctx context.Context, config KubeAgentConfig) (KubeAgentCon
346347
return config, fmt.Errorf("error retrieving nodes: %s", err)
347348
}
348349

349-
firstNode := &nodes[0]
350+
directNodes := int32(0)
351+
proxyNodes := int32(0)
352+
failedDirect := int32(0)
353+
failedProxy := int32(0)
354+
directAllowed := allowDirectConnect(config, nodes)
350355

351-
ip, port, err := clientSetNodeSource.NodeAddress(firstNode)
352-
if err != nil {
353-
return config, fmt.Errorf("error retrieving node addresses: %s", err)
354-
}
356+
var wg sync.WaitGroup
355357

356-
if allowDirectConnect(config, nodes) {
357-
// test node direct connectivity
358-
d := directNodeEndpoints(ip, port)
359-
success, err := checkEndpointConnections(config, &nodeHTTPClient, Direct, d.statsSummary())
360-
if err != nil {
361-
return config, err
362-
}
363-
if success {
364-
return config, nil
365-
}
358+
limiter := make(chan struct{}, config.ConcurrentPollers)
359+
360+
for _, n := range nodes {
361+
// block if channel is full (limiting number of goroutines)
362+
limiter <- struct{}{}
363+
wg.Add(1)
364+
go func(currentNode v1.Node) {
365+
defer func() {
366+
<-limiter
367+
wg.Done()
368+
}()
369+
directlyConnected := false
370+
ip, port, err := clientSetNodeSource.NodeAddress(&currentNode)
371+
if err != nil {
372+
log.Warnf("error retrieving node addresses: %s", err)
373+
return
374+
}
375+
if directAllowed {
376+
// test node direct connectivity
377+
d := directNodeEndpoints(ip, port)
378+
success, err := checkEndpointConnections(config, &nodeHTTPClient, Direct, d.statsSummary())
379+
if err != nil {
380+
log.Warnf("Failed to connect to node [%s] directly with cause [%s]",
381+
d.statsSummary(), err.Error())
382+
atomic.AddInt32(&failedDirect, 1)
383+
}
384+
if success {
385+
directlyConnected = true
386+
atomic.AddInt32(&directNodes, 1)
387+
}
388+
}
389+
if !directlyConnected {
390+
p := setupProxyAPI(config.ClusterHostURL, currentNode.Name)
391+
success, err := checkEndpointConnections(config, &config.HTTPClient, Proxy, p.statsSummary())
392+
if err != nil {
393+
log.Warnf("Failed to connect to node [%s] via proxy with cause [%s]",
394+
p.statsSummary(), err.Error())
395+
atomic.AddInt32(&failedProxy, 1)
396+
}
397+
if success {
398+
atomic.AddInt32(&proxyNodes, 1)
399+
}
400+
}
401+
}(n)
366402
}
403+
log.Debugln("Currently waiting for all node data to be gathered")
404+
wg.Wait()
405+
log.Infof("Of %d nodes, %d connected directly, %d connected via proxy, and %d could not be reached",
406+
len(nodes), directNodes, proxyNodes, failedProxy)
367407

368-
// test node connectivity via kube-proxy
369-
p := setupProxyAPI(config.ClusterHostURL, firstNode.Name)
370-
success, err := checkEndpointConnections(config, &config.HTTPClient, Proxy, p.statsSummary())
371-
if err != nil {
372-
return config, err
408+
if len(nodes) != int(directNodes+proxyNodes) {
409+
pct := int(directNodes+proxyNodes) * 100 / len(nodes)
410+
log.Warnf("Only %d percent of ready nodes could be connected to, "+
411+
"agent will operate in a limited mode.", pct)
373412
}
374-
if success {
375-
return config, nil
413+
414+
if (directNodes + proxyNodes) == 0 {
415+
return config, FatalNodeError
376416
}
377417

378-
return config, FatalNodeError
418+
validateConfig(config, proxyNodes, directNodes)
419+
return config, nil
420+
}
421+
422+
func validateConfig(config KubeAgentConfig, proxyNodes, directNodes int32) {
423+
if proxyNodes > 0 {
424+
config.NodeMetrics.SetAvailability(NodeStatsSummaryEndpoint, Proxy, true)
425+
} else if directNodes > 0 {
426+
config.NodeMetrics.SetAvailability(NodeStatsSummaryEndpoint, Direct, true)
427+
} else {
428+
config.NodeMetrics.SetAvailability(NodeStatsSummaryEndpoint, Proxy, false)
429+
config.NodeMetrics.SetAvailability(NodeStatsSummaryEndpoint, Direct, false)
430+
}
379431
}
380432

381433
func checkEndpointConnections(config KubeAgentConfig, client *http.Client, method Connection,
@@ -384,8 +436,7 @@ func checkEndpointConnections(config KubeAgentConfig, client *http.Client, metho
384436
if err != nil {
385437
return false, err
386438
}
387-
log.Infof("/stats/summary endpoint available via %s connection? %v", method, ns)
388-
config.NodeMetrics.SetAvailability(NodeStatsSummaryEndpoint, method, ns)
439+
log.Infof("Node [%s] available via %s connection? %v", nodeStatSum, method, ns)
389440

390441
return ns, nil
391442
}

kubernetes/nodecollection_test.go

+73-5
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import (
44
"context"
55
"crypto/tls"
66
"fmt"
7+
"k8s.io/apimachinery/pkg/runtime"
78
"net/http"
89
"net/http/httptest"
910
"os"
@@ -121,13 +122,18 @@ var nodeSampleLabels = map[string]string{
121122
}
122123

123124
func NewTestClient(ts *httptest.Server, labels map[string]string) *fake.Clientset {
125+
return NewTestClientWithNodes(ts, labels, 1)
126+
}
127+
128+
func NewTestClientWithNodes(ts *httptest.Server, labels map[string]string, numNodes int) *fake.Clientset {
124129
s := strings.Split(ts.Listener.Addr().String(), ":")
125130
ip := s[0]
126131
port, _ := strconv.Atoi(s[1])
127-
return fake.NewSimpleClientset(
128-
&v1.Node{
132+
nodes := make([]runtime.Object, numNodes)
133+
for i := 0; i < numNodes; i++ {
134+
nodes[i] = &v1.Node{
129135
ObjectMeta: metav1.ObjectMeta{
130-
Name: "proxyNode",
136+
Name: fmt.Sprintf("proxyNode.%d", i),
131137
Namespace: v1.NamespaceDefault,
132138
Labels: labels,
133139
},
@@ -148,8 +154,9 @@ func NewTestClient(ts *httptest.Server, labels map[string]string) *fake.Clientse
148154
},
149155
},
150156
},
151-
},
152-
)
157+
}
158+
}
159+
return fake.NewSimpleClientset(nodes...)
153160
}
154161

155162
func TestEnsureNodeSource(t *testing.T) {
@@ -185,6 +192,67 @@ func TestEnsureNodeSource(t *testing.T) {
185192
}
186193
})
187194

195+
t.Run("Ensure successful on mix of node failures and success", func(t *testing.T) {
196+
returnCodes := []int{200, 400, 400}
197+
ts := launchTLSTestServer(returnCodes)
198+
cs := NewTestClientWithNodes(ts, nodeSampleLabels, 2)
199+
defer ts.Close()
200+
ka := KubeAgentConfig{
201+
Clientset: cs,
202+
HTTPClient: http.Client{},
203+
CollectionRetryLimit: 0,
204+
ConcurrentPollers: 10,
205+
NodeMetrics: EndpointMask{},
206+
}
207+
ka, err := ensureNodeSource(context.TODO(), ka)
208+
if err != nil {
209+
t.Errorf("unexpected error: %v", err)
210+
}
211+
if !ka.NodeMetrics.DirectAllowed(NodeStatsSummaryEndpoint) {
212+
t.Errorf("Expected direct node retrieval method but got %v: %v",
213+
ka.NodeMetrics.Options(NodeStatsSummaryEndpoint),
214+
err)
215+
return
216+
}
217+
})
218+
219+
t.Run("Ensure proxy on mix of node direct and proxy", func(t *testing.T) {
220+
returnCodes := []int{200, 400, 200}
221+
ts := launchTLSTestServer(returnCodes)
222+
cs := NewTestClientWithNodes(ts, nodeSampleLabels, 2)
223+
defer ts.Close()
224+
ka := KubeAgentConfig{
225+
Clientset: cs,
226+
CollectionRetryLimit: 0,
227+
ConcurrentPollers: 10,
228+
NodeMetrics: EndpointMask{},
229+
ClusterHostURL: "https://" + ts.Listener.Addr().String(),
230+
// The proxy connection method uses the config http client
231+
HTTPClient: http.Client{Transport: &http.Transport{TLSClientConfig: &tls.Config{
232+
// nolint gosec
233+
InsecureSkipVerify: true,
234+
},
235+
}},
236+
}
237+
ka, err := ensureNodeSource(context.TODO(), ka)
238+
if err != nil {
239+
t.Errorf("unexpected error: %v", err)
240+
}
241+
if ka.NodeMetrics.DirectAllowed(NodeStatsSummaryEndpoint) {
242+
t.Errorf("Expected proxy node retrieval method but got %v: %v",
243+
ka.NodeMetrics.Options(NodeStatsSummaryEndpoint),
244+
err)
245+
return
246+
}
247+
248+
if !ka.NodeMetrics.ProxyAllowed(NodeStatsSummaryEndpoint) {
249+
t.Errorf("Expected proxy node retrieval method but got %v: %v",
250+
ka.NodeMetrics.Options(NodeStatsSummaryEndpoint),
251+
err)
252+
return
253+
}
254+
})
255+
188256
t.Run("Ensure all needed clients function when multiple methods are set", func(t *testing.T) {
189257
// Two endpoints will succeed both times, but stats summary will fail on direct
190258
directConnectionAttempts := []int{200}

version/version.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
package version
22

33
//VERSION is the current version of the agent
4-
var VERSION = "2.11.0"
4+
var VERSION = "2.11.1"

0 commit comments

Comments
 (0)