|
| 1 | +package kamon.instrumentation |
| 2 | +package akka.instrumentations |
| 3 | + |
| 4 | +import _root_.akka.actor.{Actor, Address, ExtendedActorSystem, Props} |
| 5 | +import _root_.akka.cluster.{Cluster, ClusterEvent, MemberStatus} |
| 6 | +import kamon.Kamon |
| 7 | +import kamon.instrumentation.akka.AkkaInstrumentation |
| 8 | +import kamon.metric.Gauge |
| 9 | +import kamon.tag.TagSet |
| 10 | +import kanela.agent.api.instrumentation.InstrumentationBuilder |
| 11 | +import kanela.agent.libs.net.bytebuddy.asm.Advice |
| 12 | + |
| 13 | +import scala.collection.mutable |
| 14 | + |
| 15 | +class ClusterInstrumentation extends InstrumentationBuilder with VersionFiltering { |
| 16 | + |
| 17 | + onAkka("2.5", "2.6") { |
| 18 | + onType("akka.cluster.Cluster$") |
| 19 | + .advise(method("createExtension").and(takesArguments(1)), AfterClusterInitializationAdvice) |
| 20 | + } |
| 21 | +} |
| 22 | + |
| 23 | +object AfterClusterInitializationAdvice { |
| 24 | + |
| 25 | + @Advice.OnMethodExit |
| 26 | + def onClusterExtensionCreated(@Advice.Argument(0) system: ExtendedActorSystem, @Advice.Return clusterExtension: Cluster): Unit = { |
| 27 | + val settings = AkkaInstrumentation.settings() |
| 28 | + if(settings.exposeClusterMetrics) { |
| 29 | + val stateExporter = system.systemActorOf(Props[ClusterInstrumentation.ClusterStateExporter], "kamon-cluster-state-exporter") |
| 30 | + clusterExtension.subscribe(stateExporter, classOf[ClusterEvent.ClusterDomainEvent]) |
| 31 | + } |
| 32 | + } |
| 33 | +} |
| 34 | + |
| 35 | +object ClusterInstrumentation { |
| 36 | + |
| 37 | + class ClusterStateExporter extends Actor { |
| 38 | + private val clusterExtension = Cluster(context.system) |
| 39 | + private val clusterTags = TagSet.of("akka.system.name", context.system.name) |
| 40 | + |
| 41 | + private val joiningMembers = ClusterMembersJoining.withTags(clusterTags) |
| 42 | + private val weaklyUpMembers = ClusterMembersWeaklyUp.withTags(clusterTags) |
| 43 | + private val upMembers = ClusterMembersUp.withTags(clusterTags) |
| 44 | + private val leavingMembers = ClusterMembersLeaving.withTags(clusterTags) |
| 45 | + private val exitingMembers = ClusterMembersExiting.withTags(clusterTags) |
| 46 | + private val downMembers = ClusterMembersDown.withTags(clusterTags) |
| 47 | + private val removedMembers = ClusterMembersRemoved.withTags(clusterTags) |
| 48 | + private val totalMembers = ClusterMembersTotal.withTags(clusterTags) |
| 49 | + private val unreachableMembers = ClusterMembersUnreachable.withTags(clusterTags) |
| 50 | + private val unreachableDatacenters = ClusterDatacentersUnreachable.withTags(clusterTags) |
| 51 | + private val monitoredNodes = mutable.HashMap.empty[Address, (Gauge, Gauge)] |
| 52 | + |
| 53 | + override def receive: Receive = { |
| 54 | + case _: ClusterEvent.ClusterDomainEvent => updateAllStates(clusterExtension.state) |
| 55 | + case initialState: ClusterEvent.CurrentClusterState => updateAllStates(initialState) |
| 56 | + } |
| 57 | + |
| 58 | + private def updateAllStates(clusterState: ClusterEvent.CurrentClusterState): Unit = { |
| 59 | + val membersPerStatus = clusterState.members.groupBy(_.status) |
| 60 | + joiningMembers.update(membersPerStatus.getOrElse(MemberStatus.Joining, Set.empty).size) |
| 61 | + weaklyUpMembers.update(membersPerStatus.getOrElse(MemberStatus.WeaklyUp, Set.empty).size) |
| 62 | + upMembers.update(membersPerStatus.getOrElse(MemberStatus.Up, Set.empty).size) |
| 63 | + leavingMembers.update(membersPerStatus.getOrElse(MemberStatus.Leaving, Set.empty).size) |
| 64 | + exitingMembers.update(membersPerStatus.getOrElse(MemberStatus.Exiting, Set.empty).size) |
| 65 | + downMembers.update(membersPerStatus.getOrElse(MemberStatus.Down, Set.empty).size) |
| 66 | + |
| 67 | + val removedMembersCount = membersPerStatus.getOrElse(MemberStatus.Removed, Set.empty).size |
| 68 | + val totalMembersCount = clusterState.members.size - removedMembersCount |
| 69 | + removedMembers.update(removedMembersCount) |
| 70 | + totalMembers.update(totalMembersCount) |
| 71 | + |
| 72 | + unreachableMembers.update(clusterState.unreachable.size) |
| 73 | + unreachableDatacenters.update(clusterState.unreachableDataCenters.size) |
| 74 | + |
| 75 | + // The status and reachability gauges will only be published for the subset of members that are currently being |
| 76 | + // monitored by this node. |
| 77 | + val currentlyMonitoredMembers = clusterState.members.filter(m => clusterExtension.failureDetector.isMonitoring(m.address)) |
| 78 | + val currentlyMonitoredAddresses = currentlyMonitoredMembers.map { member => |
| 79 | + val (statusGauge, reachabilityGauge) = monitoredNodes.getOrElseUpdate(member.address, { |
| 80 | + val memberTags = clusterTags.withTag("member", member.address.toString) |
| 81 | + |
| 82 | + ( |
| 83 | + ClusterMemberStatus.withTags(memberTags), |
| 84 | + ClusterMemberReachability.withTags(memberTags) |
| 85 | + ) |
| 86 | + }) |
| 87 | + |
| 88 | + statusGauge.update(statusToGaugeValue(member.status)) |
| 89 | + reachabilityGauge.update(if(clusterState.unreachable(member)) 1D else 0D) |
| 90 | + member.address |
| 91 | + } |
| 92 | + |
| 93 | + // Remove any cached Gauges for members that we might not be monitoring anymore |
| 94 | + monitoredNodes.keys.filterNot(a => currentlyMonitoredAddresses(a)).foreach { addressToRemove => |
| 95 | + monitoredNodes.remove(addressToRemove).foreach { |
| 96 | + case (statusGauge, reachabilityGauge) => |
| 97 | + statusGauge.remove() |
| 98 | + reachabilityGauge.remove() |
| 99 | + } |
| 100 | + } |
| 101 | + } |
| 102 | + |
| 103 | + private def statusToGaugeValue(memberStatus: MemberStatus): Double = memberStatus match { |
| 104 | + case MemberStatus.Joining => 1 |
| 105 | + case MemberStatus.WeaklyUp => 2 |
| 106 | + case MemberStatus.Up => 3 |
| 107 | + case MemberStatus.Leaving => 4 |
| 108 | + case MemberStatus.Exiting => 5 |
| 109 | + case MemberStatus.Down => 6 |
| 110 | + case MemberStatus.Removed => 7 |
| 111 | + case _ => 0 // This should never happen, but covering the bases here |
| 112 | + } |
| 113 | + } |
| 114 | + |
| 115 | + val ClusterMembersJoining = Kamon.gauge( |
| 116 | + name = "akka.cluster.members.joining.count", |
| 117 | + description = "Tracks the number of cluster members in the Joining state" |
| 118 | + ) |
| 119 | + |
| 120 | + val ClusterMembersWeaklyUp = Kamon.gauge( |
| 121 | + name = "akka.cluster.members.weakly-up.count", |
| 122 | + description = "Tracks the number of cluster members in the Weakly-Up state" |
| 123 | + ) |
| 124 | + |
| 125 | + val ClusterMembersUp = Kamon.gauge( |
| 126 | + name = "akka.cluster.members.up.count", |
| 127 | + description = "Tracks the number of cluster members in the Up state" |
| 128 | + ) |
| 129 | + |
| 130 | + val ClusterMembersLeaving = Kamon.gauge( |
| 131 | + name = "akka.cluster.members.leaving.count", |
| 132 | + description = "Tracks the number of cluster members in the Leaving state" |
| 133 | + ) |
| 134 | + |
| 135 | + val ClusterMembersExiting = Kamon.gauge( |
| 136 | + name = "akka.cluster.members.exiting.count", |
| 137 | + description = "Tracks the number of cluster members in the Exiting state" |
| 138 | + ) |
| 139 | + |
| 140 | + val ClusterMembersDown = Kamon.gauge( |
| 141 | + name = "akka.cluster.members.down.count", |
| 142 | + description = "Tracks the number of cluster members in the Down state" |
| 143 | + ) |
| 144 | + |
| 145 | + val ClusterMembersRemoved = Kamon.gauge( |
| 146 | + name = "akka.cluster.members.removed.count", |
| 147 | + description = "Tracks the number of cluster members in the Removed state" |
| 148 | + ) |
| 149 | + |
| 150 | + val ClusterMembersTotal = Kamon.gauge( |
| 151 | + name = "akka.cluster.members.total.count", |
| 152 | + description = "Tracks the total number of cluster members, without including Removed members" |
| 153 | + ) |
| 154 | + |
| 155 | + val ClusterMembersUnreachable = Kamon.gauge( |
| 156 | + name = "akka.cluster.members.unreachable.count", |
| 157 | + description = "Tracks the total number of cluster members marked as unreachable" |
| 158 | + ) |
| 159 | + |
| 160 | + val ClusterDatacentersUnreachable = Kamon.gauge( |
| 161 | + name = "akka.cluster.datacenters.unreachable.count", |
| 162 | + description = "Tracks the total number of cluster members marked as unreachable" |
| 163 | + ) |
| 164 | + |
| 165 | + val ClusterMemberStatus = Kamon.gauge( |
| 166 | + name = "akka.cluster.members.status", |
| 167 | + description = "Tracks the current status of all monitored nodes by a cluster member" |
| 168 | + ) |
| 169 | + |
| 170 | + val ClusterMemberReachability = Kamon.gauge( |
| 171 | + name = "akka.cluster.members.reachability", |
| 172 | + description = "Tracks the current reachability status of all monitored nodes by a cluster member" |
| 173 | + ) |
| 174 | +} |
0 commit comments