Skip to content

Commit 266d25e

Browse files
committed
Updated messages possibly displayed on Monitor
Moved message computation to its own method. Added new Resource category.
1 parent 81bc40c commit 266d25e

2 files changed

Lines changed: 92 additions & 18 deletions

File tree

server/monitor/src/main/java/org/apache/accumulo/monitor/next/InformationFetcher.java

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,8 @@
5252
import org.apache.accumulo.core.util.UtilWaitThread;
5353
import org.apache.accumulo.core.util.compaction.ExternalCompactionUtil;
5454
import org.apache.accumulo.core.util.threads.ThreadPools;
55+
import org.apache.accumulo.monitor.next.SystemInformation.MessageCategory;
56+
import org.apache.accumulo.monitor.next.SystemInformation.MessagePriority;
5557
import org.apache.accumulo.server.ServerContext;
5658
import org.apache.accumulo.server.compaction.CompactionPluginUtils;
5759
import org.checkerframework.checker.nullness.qual.Nullable;
@@ -321,11 +323,15 @@ public void run() {
321323
while (!futures.isEmpty()) {
322324

323325
if (NanoTime.millisElapsed(allFuturesAdded, NanoTime.now()) > monitorFetchTimeout) {
324-
LOG.warn(
325-
"Fetching information for Monitor has taken longer {}. Cancelling all"
326-
+ " remaining tasks and monitor will display old information. Resolve issue"
327-
+ " causing this or increase property {}.",
328-
monitorFetchTimeout, Property.MONITOR_FETCH_TIMEOUT.getKey());
326+
// Build the warning text once so the same string is recorded in the summary and logged.
// The parentheses around the concatenation are required: a method invocation binds tighter
// than '+', so without them formatted() would be applied to the last literal only and every
// placeholder would be left unexpanded. All placeholders are %s because the text is expanded
// by String.formatted, not by the logger.
String message =
    ("Fetching information for Monitor has taken longer %sms. Cancelling all remaining tasks (%s) "
        + "and monitor will display old information. Resolve issue causing this or increase property %s.")
        .formatted(monitorFetchTimeout, futures.size(),
            Property.MONITOR_FETCH_TIMEOUT.getKey());
// Log and add to existing summary
summaryRef.get().addMessage(MessagePriority.Critical, MessageCategory.Configuration,
    message);
LOG.warn(message);
329335
tookToLong = true;
330336
}
331337

server/monitor/src/main/java/org/apache/accumulo/monitor/next/SystemInformation.java

Lines changed: 81 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
import static com.google.common.base.Suppliers.memoize;
2222
import static org.apache.accumulo.core.metrics.MetricsInfo.QUEUE_TAG_KEY;
2323
import static org.apache.accumulo.monitor.next.SystemInformation.MessageCategory.Configuration;
24+
import static org.apache.accumulo.monitor.next.SystemInformation.MessageCategory.Resource;
2425
import static org.apache.accumulo.monitor.next.SystemInformation.MessageCategory.Table;
2526
import static org.apache.accumulo.monitor.next.SystemInformation.MessagePriority.Critical;
2627
import static org.apache.accumulo.monitor.next.SystemInformation.MessagePriority.High;
@@ -376,7 +377,7 @@ public enum MessagePriority {
376377
}
377378

378379
/**
 * Category under which a message is filed. addMessage groups messages first by priority and
 * then by this category.
 */
public enum MessageCategory {
379-
Configuration, Table;
380+
Configuration, Resource, Table;
380381
}
381382

382383
private static final Logger LOG = LoggerFactory.getLogger(SystemInformation.class);
@@ -493,7 +494,7 @@ public void clear() {
493494
serverMetricsView.clear();
494495
}
495496

496-
private void addMessage(MessagePriority pri, MessageCategory cat, String msg) {
497+
/**
 * Records a message under the given priority and category.
 *
 * <p>
 * The per-priority map and the per-category set are created on first use. Messages within a
 * category are kept in a {@link TreeSet}, so adding the same text twice stores it only once.
 *
 * @param pri priority the message is filed under
 * @param cat category the message is filed under
 * @param msg text of the message
 */
public void addMessage(MessagePriority pri, MessageCategory cat, String msg) {
497498
messages.computeIfAbsent(pri, k -> new EnumMap<>(MessageCategory.class))
498499
.computeIfAbsent(cat, k -> new TreeSet<>()).add(msg);
499500
}
@@ -676,10 +677,6 @@ public void processTabletInformation(TableId tableId, String tableName, TabletIn
676677
tablets.computeIfAbsent(tableId, (t) -> Collections.synchronizedList(new ArrayList<>()))
677678
.add(sti);
678679
tables.computeIfAbsent(tableId, (t) -> new TableSummary(tableName)).addTablet(sti);
679-
if (sti.getEstimatedEntries() == 0) {
680-
addMessage(Info, Table, "Tablet " + sti.getTabletId().toString() + " (tid: "
681-
+ sti.getTabletId().getTable() + ") may have zero entries and could be merged.");
682-
}
683680
}
684681

685682
public void processError(ServerId server) {
@@ -703,13 +700,50 @@ public void addConfiguredCompactionGroups(Set<String> groups) {
703700
configuredCompactionResourceGroups.addAll(groups);
704701
}
705702

706-
public void finish() {
707-
// Update the deployment not-responded numbers based
708-
// on metric fetch failures for this refresh.
709-
metricProblemHosts.forEach(serverId -> {
710-
deployment.computeIfAbsent(serverId.getResourceGroup(), g -> new ConcurrentHashMap<>())
711-
.computeIfAbsent(serverId.getType(), t -> new ProcessSummary()).addNotResponded(serverId);
703+
private void computeMessages() {
704+
705+
if (managers.isEmpty()) {
706+
addMessage(Critical, Resource, "No Managers are running");
707+
}
708+
709+
if (gc.get() == null) {
710+
addMessage(Critical, Resource, "Garbage Collector is not running");
711+
}
712+
713+
// Report how many servers are currently tracked in problemHosts.
if (problemHosts.size() > 0) {
  addMessage(Info, Resource, "Monitor has not received a response from " + problemHosts.size()
      + " servers in the last 10 minutes");
}
717+
718+
if (metricProblemHosts.size() > 0) {
719+
addMessage(Info, Resource,
720+
"Unable to gather information from " + metricProblemHosts.size() + " servers");
721+
}
722+
723+
for (ResourceGroupId rg : ctx.resourceGroupOperations().list()) {
724+
if (rg == ResourceGroupId.DEFAULT) {
725+
continue;
726+
}
727+
if (!compactors.containsKey(rg.canonical()) && !sservers.containsKey(rg.canonical())
728+
&& !tservers.containsKey(rg.canonical())) {
729+
addMessage(Info, Configuration, "Resource Group " + rg
730+
+ " exists, but no resources assigned. Consider removing the resource group");
731+
}
732+
}
733+
734+
tablets.forEach((tid, tablets) -> {
735+
int empty = 0;
736+
for (TabletInformation tablet : tablets) {
737+
if (tablet.getEstimatedEntries() == 0) {
738+
empty++;
739+
}
740+
}
741+
if (empty > 0) {
742+
addMessage(Info, Table,
743+
"Table " + tid + " may have " + empty + " tablets that could be merged.");
744+
}
712745
});
746+
713747
for (SystemTables table : SystemTables.values()) {
714748
TableConfiguration tconf = this.ctx.getTableConfiguration(table.tableId());
715749
String balancerRG = tconf.get(TableLoadBalancer.TABLE_ASSIGNMENT_GROUP_PROPERTY);
@@ -721,6 +755,28 @@ public void finish() {
721755
}
722756
}
723757

758+
FMetric flatbuffer = new FMetric();
759+
long serversWithZombieScans = 0;
760+
for (Entry<ServerId,MetricResponse> e : allMetrics.asMap().entrySet()) {
761+
ServerId sid = e.getKey();
762+
List<ByteBuffer> metrics = e.getValue().metrics;
763+
if (sid.getType() == ServerId.Type.SCAN_SERVER
764+
|| sid.getType() == ServerId.Type.TABLET_SERVER) {
765+
for (ByteBuffer binary : metrics) {
766+
flatbuffer = FMetric.getRootAsFMetric(binary, flatbuffer);
767+
if (flatbuffer.name().equals(Metric.SCAN_ZOMBIE_THREADS.getName())) {
768+
if (getMetricValue(flatbuffer).longValue() > 0) {
769+
serversWithZombieScans++;
770+
}
771+
}
772+
}
773+
}
774+
}
775+
if (serversWithZombieScans > 0) {
776+
addMessage(High, Resource,
777+
"There are " + serversWithZombieScans + " servers with zombie scan threads");
778+
}
779+
724780
for (String rg : getResourceGroups()) {
725781
Set<ServerId> rgCompactors = getCompactorResourceGroupServers(rg);
726782
List<FMetric> metrics = queueMetrics.get(rg);
@@ -749,7 +805,7 @@ public void finish() {
749805
if (idleMetric.isPresent()) {
750806
var metric = idleMetric.orElseThrow().getValue();
751807
if (metric.max() == 1.0D) {
752-
addMessage(High, Configuration,
808+
addMessage(High, Resource,
753809
"Compactor group " + rg + " has queued jobs and idle compactors.");
754810
}
755811
}
@@ -766,6 +822,18 @@ public void finish() {
766822
}
767823
}
768824

825+
}
826+
827+
public void finish() {
828+
// Update the deployment not-responded numbers based
829+
// on metric fetch failures for this refresh.
830+
metricProblemHosts.forEach(serverId -> {
831+
deployment.computeIfAbsent(serverId.getResourceGroup(), g -> new ConcurrentHashMap<>())
832+
.computeIfAbsent(serverId.getType(), t -> new ProcessSummary()).addNotResponded(serverId);
833+
});
834+
835+
computeMessages();
836+
769837
timestamp.set(System.currentTimeMillis());
770838
componentStatuses.clear();
771839
for (final ServerId.Type type : ServerId.Type.values()) {

0 commit comments

Comments
 (0)