Skip to content

Commit 273ebc0

Browse files
committed
Limit the maximum number of registered metric collectors.
Change-Id: I18204ab923b8f02a3fd48f3a6ab34cb45e160b01
1 parent bd50eca commit 273ebc0

4 files changed

Lines changed: 45 additions & 10 deletions

File tree

cmd/csi_driver/main.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ var (
4747
informerResyncDurationSec = flag.Int("informer-resync-duration-sec", 1800, "informer resync duration in seconds")
4848
fuseSocketDir = flag.String("fuse-socket-dir", "/sockets", "FUSE socket directory")
4949
metricsEndpoint = flag.String("metrics-endpoint", "", "The TCP network address where the Prometheus metrics endpoint will listen (example: `:8080`). The default is empty string, which means that the metrics endpoint is disabled.")
50+
maximumNumberOfCollectors = flag.Int("max-metric-collectors", -1, "Maximum number of prometheus metric collectors exporting metrics at a time, less than 0 (e.g -1) means no limit.")
5051

5152
// These are set at compile time.
5253
version = "unknown"
@@ -109,7 +110,7 @@ func main() {
109110
}
110111

111112
if *metricsEndpoint != "" {
112-
mm = metrics.NewMetricsManager(*metricsEndpoint, *fuseSocketDir, clientset)
113+
mm = metrics.NewMetricsManager(*metricsEndpoint, *fuseSocketDir, *maximumNumberOfCollectors, clientset)
113114
mm.InitializeHTTPHandler()
114115
}
115116
}

deploy/base/node/node.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ spec:
5353
- --node=true
5454
- --identity-provider=$(IDENTITY_PROVIDER)
5555
- --metrics-endpoint=:9920
56+
- --max-metric-collectors=10
5657
ports:
5758
- containerPort: 9920
5859
name: metrics

pkg/csi_driver/node.go

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,8 @@ func (s *nodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublish
114114
s.volumeStateStore.Store(targetPath, &util.VolumeState{})
115115
vs, _ = s.volumeStateStore.Load(targetPath)
116116
}
117-
117+
// volumeState is safe to access for remaining of function since volumeLock prevents
118+
// Node Publish/Unpublish Volume calls from running more than once at a time per volume.
118119
if !vs.BucketAccessCheckPassed {
119120
storageService, err := s.prepareStorageService(ctx, vc)
120121
if err != nil {
@@ -165,7 +166,7 @@ func (s *nodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublish
165166
return nil, status.Error(codes.FailedPrecondition, "failed to find the sidecar container in Pod spec")
166167
}
167168

168-
// Register metrics collecter.
169+
// Register metrics collector.
169170
// It is idempotent to register the same collector in node republish calls.
170171
if s.driver.config.MetricsManager != nil && !disableMetricsCollection {
171172
klog.V(6).Infof("NodePublishVolume enabling metrics collector for target path %q", targetPath)

pkg/metrics/metrics.go

Lines changed: 39 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ import (
2626
"os"
2727
"path/filepath"
2828
"strings"
29+
"sync"
2930
"time"
3031

3132
"github.com/googlecloudplatform/gcs-fuse-csi-driver/pkg/cloud_provider/clientset"
@@ -56,14 +57,20 @@ type manager struct {
5657
metricsEndpoint string
5758
fuseSocketDir string
5859
clientset clientset.Interface
60+
61+
maximumNumberOfCollectors int
62+
registeredCollectorsCount int
63+
mutex sync.Mutex
5964
}
6065

61-
func NewMetricsManager(metricsEndpoint, fuseSocketDir string, clientset clientset.Interface) Manager {
66+
func NewMetricsManager(metricsEndpoint, fuseSocketDir string, maximumNumberOfCollectors int, clientset clientset.Interface) Manager {
6267
mm := &manager{
63-
registry: prometheus.NewRegistry(),
64-
metricsEndpoint: metricsEndpoint,
65-
fuseSocketDir: fuseSocketDir,
66-
clientset: clientset,
68+
registry: prometheus.NewRegistry(),
69+
metricsEndpoint: metricsEndpoint,
70+
fuseSocketDir: fuseSocketDir,
71+
clientset: clientset,
72+
maximumNumberOfCollectors: maximumNumberOfCollectors,
73+
mutex: sync.Mutex{},
6774
}
6875

6976
return mm
@@ -115,8 +122,25 @@ func (mm *manager) RegisterMetricsCollector(targetPath, podNamespace, podName, b
115122
"bucket_name": bucketName,
116123
"pod_uid": podUID,
117124
}, mm.clientset)
118-
if err := mm.registry.Register(c); err != nil && !strings.Contains(err.Error(), prometheus.AlreadyRegisteredError{}.Error()) {
119-
klog.Errorf("failed to register metrics collector for pod %v/%v, volume %q, bucket %q: %v", podNamespace, podName, volumeName, bucketName, err)
125+
126+
// Lock the number of registered collectors while we attemtp to register a new collector.
127+
mm.mutex.Lock()
128+
defer mm.mutex.Unlock()
129+
130+
// We skip registration when we already registered all supported metrics collectors. If a limit is
131+
// unset (maximumNumberOfCollectors less than zero), we continue to registration.
132+
if mm.maximumNumberOfCollectors >= 0 && mm.registeredCollectorsCount >= mm.maximumNumberOfCollectors {
133+
return
134+
}
135+
136+
err = mm.registry.Register(c)
137+
if err != nil {
138+
if !strings.Contains(err.Error(), prometheus.AlreadyRegisteredError{}.Error()) {
139+
klog.Errorf("failed to register metrics collector for pod %v/%v, volume %q, bucket %q: %v", podNamespace, podName, volumeName, bucketName, err)
140+
}
141+
} else {
142+
mm.registeredCollectorsCount += 1
143+
klog.Infof("successfully registered a new metrics collector: podUID: %s, volume: %s. there's %d collectors registerd.", podUID, bucketName, mm.registeredCollectorsCount)
120144
}
121145
}
122146

@@ -126,8 +150,16 @@ func (mm *manager) UnregisterMetricsCollector(targetPath string) {
126150

127151
// metricsCollector uses a hash of pod UID and volume name as an identifier.
128152
c := NewMetricsCollector("", "", "", "", podUID, volumeName, nil, nil)
153+
154+
// Lock the number of registered collectors while we attempt to unregister a collector.
155+
mm.mutex.Lock()
156+
defer mm.mutex.Unlock()
157+
129158
if ok := mm.registry.Unregister(c); !ok {
130159
klog.Infof("Unregister metrics collector for targetPath %q is not needed since the collector is not registered", targetPath)
160+
} else {
161+
mm.registeredCollectorsCount -= 1
162+
klog.Infof("successfully unregistered a metrics collector: podUID: %s, volume: %s. there's %d collectors registerd.", podUID, volumeName, mm.registeredCollectorsCount)
131163
}
132164
}
133165

0 commit comments

Comments
 (0)