Skip to content

Commit 2012f0b

Browse files
authored
[metrics] Add partition count metrics for tables and cluster monitoring (#1662)
* [FLUSS-1571] Add partition count metric for cluster monitoring
  - Add PARTITION_COUNT metric name constant
  - Implement getTotalPartitionCount() in CoordinatorContext
  - Add partition count metric registration and updates in CoordinatorEventManager
  - Follow existing TABLE_COUNT metric implementation pattern
  - Add basic tests for partition count functionality

  This provides a simple way to monitor total partition count across the cluster, helping users identify when too many partitions might cause cluster instability.
* [FLUSS-1571] [docs] Add partition count metrics documentation
  - Add partitionCount metric documentation for cluster-level monitoring
  - Document the new metric that tracks total number of partitions in the cluster
  - This metric helps users monitor partition distribution and identify potential cluster instability issues
* [FLUSS-1571] [docs] Fix table rowspan for partition count metrics
1 parent 65f7bb0 commit 2012f0b

File tree

4 files changed

+20
-2
lines changed

4 files changed

+20
-2
lines changed

fluss-common/src/main/java/org/apache/fluss/metrics/MetricNames.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ public class MetricNames {
4040
public static final String OFFLINE_BUCKET_COUNT = "offlineBucketCount";
4141
public static final String TABLE_COUNT = "tableCount";
4242
public static final String BUCKET_COUNT = "bucketCount";
43+
public static final String PARTITION_COUNT = "partitionCount";
4344
public static final String REPLICAS_TO_DELETE_COUNT = "replicasToDeleteCount";
4445

4546
// for coordinator event processor

fluss-server/src/main/java/org/apache/fluss/server/coordinator/CoordinatorContext.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -657,4 +657,8 @@ public void resetContext() {
657657
liveTabletServers.clear();
658658
shuttingDownTabletServers.clear();
659659
}
660+
661+
public int getTotalPartitionCount() {
662+
return partitionAssignments.size();
663+
}
660664
}

fluss-server/src/main/java/org/apache/fluss/server/coordinator/event/CoordinatorEventManager.java

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ public final class CoordinatorEventManager implements EventManager {
6666
private volatile int offlineBucketCount;
6767
private volatile int tableCount;
6868
private volatile int bucketCount;
69+
private volatile int partitionCount;
6970
private volatile int replicasToDeleteCount;
7071

7172
private static final int WINDOW_SIZE = 100;
@@ -91,6 +92,7 @@ private void registerMetrics() {
9192
coordinatorMetricGroup.gauge(MetricNames.OFFLINE_BUCKET_COUNT, () -> offlineBucketCount);
9293
coordinatorMetricGroup.gauge(MetricNames.BUCKET_COUNT, () -> bucketCount);
9394
coordinatorMetricGroup.gauge(MetricNames.TABLE_COUNT, () -> tableCount);
95+
coordinatorMetricGroup.gauge(MetricNames.PARTITION_COUNT, () -> partitionCount);
9496
coordinatorMetricGroup.gauge(
9597
MetricNames.REPLICAS_TO_DELETE_COUNT, () -> replicasToDeleteCount);
9698
}
@@ -104,6 +106,7 @@ private void updateMetricsViaAccessContext() {
104106
int tabletServerCount = context.getLiveTabletServers().size();
105107
int tableCount = context.allTables().size();
106108
int bucketCount = context.bucketLeaderAndIsr().size();
109+
int partitionCount = context.getTotalPartitionCount();
107110
int offlineBucketCount = context.getOfflineBucketCount();
108111

109112
int replicasToDeletes = 0;
@@ -135,6 +138,7 @@ private void updateMetricsViaAccessContext() {
135138
tabletServerCount,
136139
tableCount,
137140
bucketCount,
141+
partitionCount,
138142
offlineBucketCount,
139143
replicasToDeletes);
140144
});
@@ -147,6 +151,7 @@ private void updateMetricsViaAccessContext() {
147151
this.tabletServerCount = metricsData.tabletServerCount;
148152
this.tableCount = metricsData.tableCount;
149153
this.bucketCount = metricsData.bucketCount;
154+
this.partitionCount = metricsData.partitionCount;
150155
this.offlineBucketCount = metricsData.offlineBucketCount;
151156
this.replicasToDeleteCount = metricsData.replicasToDeleteCount;
152157
} catch (Exception e) {
@@ -268,18 +273,21 @@ private static class MetricsData {
268273
private final int tabletServerCount;
269274
private final int tableCount;
270275
private final int bucketCount;
276+
private final int partitionCount;
271277
private final int offlineBucketCount;
272278
private final int replicasToDeleteCount;
273279

274280
public MetricsData(
275281
int tabletServerCount,
276282
int tableCount,
277283
int bucketCount,
284+
int partitionCount,
278285
int offlineBucketCount,
279286
int replicasToDeleteCount) {
280287
this.tabletServerCount = tabletServerCount;
281288
this.tableCount = tableCount;
282289
this.bucketCount = bucketCount;
290+
this.partitionCount = partitionCount;
283291
this.offlineBucketCount = offlineBucketCount;
284292
this.replicasToDeleteCount = replicasToDeleteCount;
285293
}

website/docs/maintenance/observability/monitor-metrics.md

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -294,8 +294,8 @@ Some metrics might not be exposed when using other JVM implementations (e.g. IBM
294294
</thead>
295295
<tbody>
296296
<tr>
297-
<th rowspan="12"><strong>coordinator</strong></th>
298-
<td style={{textAlign: 'center', verticalAlign: 'middle' }} rowspan="7">-</td>
297+
<th rowspan="13"><strong>coordinator</strong></th>
298+
<td style={{textAlign: 'center', verticalAlign: 'middle' }} rowspan="8">-</td>
299299
<td>activeCoordinatorCount</td>
300300
<td>The number of active CoordinatorServer in this cluster.</td>
301301
<td>Gauge</td>
@@ -320,6 +320,11 @@ Some metrics might not be exposed when using other JVM implementations (e.g. IBM
320320
<td>The total number of buckets in this cluster.</td>
321321
<td>Gauge</td>
322322
</tr>
323+
<tr>
324+
<td>partitionCount</td>
325+
<td>The total number of partitions in this cluster.</td>
326+
<td>Gauge</td>
327+
</tr>
323328
<tr>
324329
<td>replicasToDeleteCount</td>
325330
<td>The total number of replicas in the process of being deleted in this cluster.</td>

0 commit comments

Comments
 (0)