2121import static com .google .common .base .Suppliers .memoize ;
2222import static org .apache .accumulo .core .metrics .MetricsInfo .QUEUE_TAG_KEY ;
2323import static org .apache .accumulo .monitor .next .SystemInformation .MessageCategory .Configuration ;
24+ import static org .apache .accumulo .monitor .next .SystemInformation .MessageCategory .Resource ;
2425import static org .apache .accumulo .monitor .next .SystemInformation .MessageCategory .Table ;
2526import static org .apache .accumulo .monitor .next .SystemInformation .MessagePriority .Critical ;
2627import static org .apache .accumulo .monitor .next .SystemInformation .MessagePriority .High ;
@@ -376,7 +377,7 @@ public enum MessagePriority {
376377 }
377378
378379 public enum MessageCategory {
379- Configuration , Table ;
380+ Configuration , Resource , Table ;
380381 }
381382
382383 private static final Logger LOG = LoggerFactory .getLogger (SystemInformation .class );
@@ -493,7 +494,7 @@ public void clear() {
493494 serverMetricsView .clear ();
494495 }
495496
496- private void addMessage (MessagePriority pri , MessageCategory cat , String msg ) {
497+ public void addMessage (MessagePriority pri , MessageCategory cat , String msg ) {
497498 messages .computeIfAbsent (pri , k -> new EnumMap <>(MessageCategory .class ))
498499 .computeIfAbsent (cat , k -> new TreeSet <>()).add (msg );
499500 }
@@ -676,10 +677,6 @@ public void processTabletInformation(TableId tableId, String tableName, TabletIn
676677 tablets .computeIfAbsent (tableId , (t ) -> Collections .synchronizedList (new ArrayList <>()))
677678 .add (sti );
678679 tables .computeIfAbsent (tableId , (t ) -> new TableSummary (tableName )).addTablet (sti );
679- if (sti .getEstimatedEntries () == 0 ) {
680- addMessage (Info , Table , "Tablet " + sti .getTabletId ().toString () + " (tid: "
681- + sti .getTabletId ().getTable () + ") may have zero entries and could be merged." );
682- }
683680 }
684681
685682 public void processError (ServerId server ) {
@@ -703,13 +700,50 @@ public void addConfiguredCompactionGroups(Set<String> groups) {
703700 configuredCompactionResourceGroups .addAll (groups );
704701 }
705702
706- public void finish () {
707- // Update the deployment not-responded numbers based
708- // on metric fetch failures for this refresh.
709- metricProblemHosts .forEach (serverId -> {
710- deployment .computeIfAbsent (serverId .getResourceGroup (), g -> new ConcurrentHashMap <>())
711- .computeIfAbsent (serverId .getType (), t -> new ProcessSummary ()).addNotResponded (serverId );
703+ private void computeMessages () {
704+
705+ if (managers .isEmpty ()) {
706+ addMessage (Critical , Resource , "No Managers are running" );
707+ }
708+
709+ if (gc .get () == null ) {
710+ addMessage (Critical , Resource , "Garbage Collector is not running" );
711+ }
712+
713+ if (problemHosts .size () > 0 ) {
714+ addMessage (Info , Resource , "Monitor has not received a response from " + problemHosts .size ()
715+ + " servers in the last 10 minutes" );
716+ }
717+
718+ if (metricProblemHosts .size () > 0 ) {
719+ addMessage (Info , Resource ,
720+ "Unable to gather information from " + metricProblemHosts .size () + " servers" );
721+ }
722+
723+ for (ResourceGroupId rg : ctx .resourceGroupOperations ().list ()) {
724+ if (rg == ResourceGroupId .DEFAULT ) {
725+ continue ;
726+ }
727+ if (!compactors .containsKey (rg .canonical ()) && !sservers .containsKey (rg .canonical ())
728+ && !tservers .containsKey (rg .canonical ())) {
729+ addMessage (Info , Configuration , "Resource Group " + rg
730+ + " exists, but no resources assigned. Consider removing the resource group" );
731+ }
732+ }
733+
734+ tablets .forEach ((tid , tablets ) -> {
735+ int empty = 0 ;
736+ for (TabletInformation tablet : tablets ) {
737+ if (tablet .getEstimatedEntries () == 0 ) {
738+ empty ++;
739+ }
740+ }
741+ if (empty > 0 ) {
742+ addMessage (Info , Table ,
743+ "Table " + tid + " may have " + empty + " tablets that could be merged." );
744+ }
712745 });
746+
713747 for (SystemTables table : SystemTables .values ()) {
714748 TableConfiguration tconf = this .ctx .getTableConfiguration (table .tableId ());
715749 String balancerRG = tconf .get (TableLoadBalancer .TABLE_ASSIGNMENT_GROUP_PROPERTY );
@@ -721,6 +755,28 @@ public void finish() {
721755 }
722756 }
723757
758+ FMetric flatbuffer = new FMetric ();
759+ long serversWithZombieScans = 0 ;
760+ for (Entry <ServerId ,MetricResponse > e : allMetrics .asMap ().entrySet ()) {
761+ ServerId sid = e .getKey ();
762+ List <ByteBuffer > metrics = e .getValue ().metrics ;
763+ if (sid .getType () == ServerId .Type .SCAN_SERVER
764+ || sid .getType () == ServerId .Type .TABLET_SERVER ) {
765+ for (ByteBuffer binary : metrics ) {
766+ flatbuffer = FMetric .getRootAsFMetric (binary , flatbuffer );
767+ if (flatbuffer .name ().equals (Metric .SCAN_ZOMBIE_THREADS .getName ())) {
768+ if (getMetricValue (flatbuffer ).longValue () > 0 ) {
769+ serversWithZombieScans ++;
770+ }
771+ }
772+ }
773+ }
774+ }
775+ if (serversWithZombieScans > 0 ) {
776+ addMessage (High , Resource ,
777+ "There are " + serversWithZombieScans + " servers with zombie scan threads" );
778+ }
779+
724780 for (String rg : getResourceGroups ()) {
725781 Set <ServerId > rgCompactors = getCompactorResourceGroupServers (rg );
726782 List <FMetric > metrics = queueMetrics .get (rg );
@@ -749,7 +805,7 @@ public void finish() {
749805 if (idleMetric .isPresent ()) {
750806 var metric = idleMetric .orElseThrow ().getValue ();
751807 if (metric .max () == 1.0D ) {
752- addMessage (High , Configuration ,
808+ addMessage (High , Resource ,
753809 "Compactor group " + rg + " has queued jobs and idle compactors." );
754810 }
755811 }
@@ -766,6 +822,18 @@ public void finish() {
766822 }
767823 }
768824
825+ }
826+
827+ public void finish () {
828+ // Update the deployment not-responded numbers based
829+ // on metric fetch failures for this refresh.
830+ metricProblemHosts .forEach (serverId -> {
831+ deployment .computeIfAbsent (serverId .getResourceGroup (), g -> new ConcurrentHashMap <>())
832+ .computeIfAbsent (serverId .getType (), t -> new ProcessSummary ()).addNotResponded (serverId );
833+ });
834+
835+ computeMessages ();
836+
769837 timestamp .set (System .currentTimeMillis ());
770838 componentStatuses .clear ();
771839 for (final ServerId .Type type : ServerId .Type .values ()) {
0 commit comments