From d77895370b19f747cac65ceea55b6bd7e8e4de01 Mon Sep 17 00:00:00 2001 From: Abhinav Agarwalla Date: Mon, 9 Feb 2026 19:18:15 +0530 Subject: [PATCH 01/16] feat: ping latency metrics --- examples/metrics/__init__.py | 0 examples/metrics/coordinator.py | 78 ++++++++++++++++++ examples/metrics/data/queries.active | Bin 0 -> 20001 bytes examples/metrics/data/wal/00000000 | Bin 0 -> 32768 bytes examples/metrics/prometheus.yml | 9 ++ examples/metrics/runner.py | 58 +++++++++++++ libp2p/__init__.py | 18 +++- libp2p/abc.py | 5 ++ libp2p/host/basic_host.py | 9 +- libp2p/host/ping.py | 37 ++++++++- libp2p/metrics/_init__.py | 0 libp2p/metrics/metrics.py | 29 +++++++ libp2p/metrics/ping.py | 37 +++++++++ libp2p/network/connection/swarm_connection.py | 4 +- libp2p/network/stream/net_stream.py | 9 +- libp2p/network/swarm.py | 7 ++ pyproject.toml | 2 + 17 files changed, 292 insertions(+), 10 deletions(-) create mode 100644 examples/metrics/__init__.py create mode 100644 examples/metrics/coordinator.py create mode 100644 examples/metrics/data/queries.active create mode 100644 examples/metrics/data/wal/00000000 create mode 100644 examples/metrics/prometheus.yml create mode 100644 examples/metrics/runner.py create mode 100644 libp2p/metrics/_init__.py create mode 100644 libp2p/metrics/metrics.py create mode 100644 libp2p/metrics/ping.py diff --git a/examples/metrics/__init__.py b/examples/metrics/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/examples/metrics/coordinator.py b/examples/metrics/coordinator.py new file mode 100644 index 000000000..bfe829fd8 --- /dev/null +++ b/examples/metrics/coordinator.py @@ -0,0 +1,78 @@ +import multiaddr +import trio + +from libp2p import new_host +from libp2p.host.ping import ( + ID as PING_ID, + PingService, + handle_ping, +) +from libp2p.peer.peerinfo import info_from_p2p_addr + +COMMANDS = """ +Available commands: +- connect - Connect to another peer +- ping - Ping to another peer +- local - List local multiaddr +- 
help - List the existing commands +- exit - Shut down +""" + + +class Node: + def __init__(self, listen_addrs: list[multiaddr.Multiaddr]): + # Create a libp2p-host + self.host = new_host(listen_addrs=listen_addrs, enable_metrics=True) + + # Setup PING service + self.host.set_stream_handler(PING_ID, handle_ping) + self.ping_service = PingService(self.host) + + # CLI input send/receive channels + self.input_send_channel, self.input_receive_channel = trio.open_memory_channel( + 100 + ) + + self.termination_event = trio.Event() + + async def command_executor(self, nursery): + print("Starting command executor loop...") + + async with self.input_receive_channel: + async for parts in self.input_receive_channel: + try: + if not parts: + continue + cmd = parts[0].lower() + + if cmd == "connect" and len(parts) > 1: + maddr = multiaddr.Multiaddr(parts[1]) + info = info_from_p2p_addr(maddr) + + await self.host.connect(info) + print("Connected to {info.peer_id}") + + if cmd == "ping" and len(parts) > 1: + maddr = multiaddr.Multiaddr(parts[1]) + info = info_from_p2p_addr(maddr) + + await self.host.connect(info) + await self.ping_service.ping(info.peer_id, int(parts[2])) + + # Then the rtts will be fed to the prometheus-metrics + + if cmd == "local": + maddr = self.host.get_addrs()[0] + print(maddr) + + if cmd == "help": + print(COMMANDS) + + if cmd == "exit": + print("Exiting...") + self.termination_event.set() + nursery.cancel_scope.cancel() # Stops all tasks + raise KeyboardInterrupt + + except Exception as e: + print(f"Error executing command {parts}: {e}") diff --git a/examples/metrics/data/queries.active b/examples/metrics/data/queries.active new file mode 100644 index 0000000000000000000000000000000000000000..8bfef0eabd49630443efda04a88fc8fb2b6b67bd GIT binary patch literal 20001 zcmeIuu>b%700OW)^Y=)jV?$Vj0RsjM7%*VKfB^#r3>YwAz<>b*1`HT5V8DO@0|pEj zFkrxd0RsjM7%*VKfB^#r3>YwAz<>b*1`HT5V8DO@0|pEjFkrxd0RsjM7%*VKfB^#r H3|tHh6@UO- literal 0 HcmV?d00001 diff --git 
a/examples/metrics/data/wal/00000000 b/examples/metrics/data/wal/00000000 new file mode 100644 index 0000000000000000000000000000000000000000..f80ba9d8bcced89ed0e8c7e2ccd19a5cccc600ea GIT binary patch literal 32768 zcmeI)2~<;88UWz?Ab}(VNib0(P;mi40a+AL2Z>VLpdz5MBq78QHA)tfprF>`+Ht9! zR%=xhM5wk?rJ_ZVR$HSFORY7U6k4UqDNUzTq)pcuO+yETgaiz+ouQk;(KDmC zz6!mbJ)Hq;E~)_|fVEc*fGuo-z*6`JL!<^CqlOWTbggeFS1~0`RUbxphXH2229$*p zz>a(!O#pk52Dl6v4gU>%S7UuB;T;NC$CRKf2m;7gZ-T*LrGE|_e7y&%wNrJS)Ea%7 zN~6-6R2x|K;S*Hza>{K=m>R_loq-5qYSdbt!6qP(qgR-e)6-P5RcY{V_On$6BYXq< zAisb>_*-*|;b1FVrFYdEbV`-cD4(r1n9>z#a*ax(Gi1uAWSUgQoMpf<9&knRz>Od8 zYET)~sqkIv698}&C4lby1ThSd!6Y}SVR(%yrB0h#wiAF(qDer+pTtt?)A{2Wa#LPV zDM0`y(L})GPZa3%Dy@9#;amWCqDbJxk7Q{S89$5zfS>rQkAq&VohF}>uAHeds}+ZlValVn(ox>6#oD8^>@U5`_aUGrXJ9pi;o+p>#9=-9@8W zHtG7(Pyk#+pV9*<}FSL@~gF zJyICM((8OwEn>w^V6_c#QN{e6tt3$uw>PQ{mHABWAn=G+;GRzy206lqSmA*m$vhb z2e8(H>-xTk)dP<8Z8b2@%h+S`-wW_?Ku^O>Pl|k$F)aIH)T6wVMEG}VkfSq}? zM@vrD^O6IpTv&3%Y*=}m$;#vKyy`f-(E3}9PV6}h;3^!&C94gyFBiB2vEw{$FCZ4M z1)z_R13clE7yRl6zkFZ~^8@0X`OHvYS!iTiPkiF&R*4m|Y7X5*gel>a5*h+LvWm{% z#|qCayMGxG8c$G4m=bv~^!?0dSmE)8fqf9+yK$5f%7c#S{x!S;C%p9LP)|N0+;|ZR zcMflLKRkr(hXQi;H=u&xHg`mVodPU+;te=Uc%vj1jypqMjyo*(Ki?m{$k8>aY2e(n z;laX1ugngrZ(J*~LzIP6t)o;E?1_ldJh&FT9CCQ%kmJebkYkP?oF`+(oMP=OV~(_N zu3zKPEL=9L-yZw)7sUC^EebkiX*&cf!ZNVV;N>F&5a+*)6m+I|x^-O<|4*#*@#C;F zi1S7h1)Y+KC8PGvOTs$&7Y}zqoKF{1(3$K7Hoaq-gLM-2GZtDpH}V`Q=#<8&zD$48 z8S9)d|BHCU*?V2PIUgQ=U+VR!Eyi2Pl}Ez<-_5`}7xbObgg6JgQP3$(t>`rF7=FC{ zZIk9C;vAPxL8qi*b=J)|^De^6&7qgO2-WV6Gmvp;W3nFwol^Hc-vo(GT&=)){NF6+@wFS^X7xtP#R}K` zFU;i4tCf5oqwOF=ORJUIIfny~S^UO>l~z+CtCgFucESRR^v(o!EDx?$B!`$nULPQq zX2G2(PxuOVOpu+Ds|KbwvQr|N9l2JLd)S=S#1~ugFGt57DTyafKRsuI^@arT znTS_Mo}IQ&e%szF3Hyw3pkX-H%UZB+1>#-oNnx*aN$u3yhUW9)7UowsGOme{U5e~= zg;w(-d;B+_=htG7pPV-Tz}F9DHaRQLR1%l}S~1>{lAtV2u( za&=^!0MD%9>d3eSzQWa!aTh$h{=@P}01uL%L}d}?^%0M}vIO|%J4>?0@G;&)R*(Z0 z;mHE8oL|)+pB~Re&^6^0M<-=}n>VfDAQoMBzu%Hx2)dCxd~Kgho3BL7f@PQm?+LII 
zUnDy4|4Vd3m#qosuj0+a^Xqt)=1mS&)HX}AK8mV+d$;*rDs*{nE!B$#WoiP)lKRiceL~W7(f#^L%&5ybI$H^C=eyE_wa(WuF%Q=|0`fJa~uQC7k^MfCWn?Sh?%xiDfMntUG~&-`Rt(6Uj5&ucx=B zJevzv%IgbO*ozw$nXl|wHUz=Tk}ua1=20xSpt)S-r8bwVt(@j^wN(ILz2MclF0gDp zzxxDB)(EfNKNCHWEj?+VV#?(2)3M3EbEWA{Iil8(C%TkYOX_PbS3Ii0tCvKL(IV=F zov5ysRZmlXV7h=;$K)3GL)2^6P+gsDe_jzkX)IoST`?O~e@dSHQu+y(ZyCS#y>h)tUPDFD36?qa&X?4=d$;C%Uug9u??^pH1azuT)ztz;pqNBllAS1QHC_XFjLbAu+ zM7&!ZA<979SN2ffEpghi?0x}$s`5L(N|KAX9}T0vTYB5ArYPtae7@c7wYx3do7vGs<1xIMk`?())td?erTSKG}E?|Q=v-&Uw&Ne5Qv+>6P@yR&Ad zY(m^KXHwrSS>CsY#-PNzqZ8I3+YFl*-0uKx&sl{>8bk5!M_;SskbHkQkNR%efjgB? zcYT3(?>MpsxkS&QetR$8{iujP0Lk~|Z0fsZbAnS#WUra0mKQbsgt#B{roKD*KQ8GHvcvHC zKFKXFwshy%f8K6x^1AZqLMzN$yCpMSZalf(h<9hK&6$U|=T}kRE!9MN&&(>oyQjK6 zo`bmmp`^ZBwmIjMz2)W$?=AQ50fB-wh&%rl_1(!muG~79VZQj@vU|gw%8wBDK4S-P z8-AYP=KPxHyeOw23UU9?)B)U@ag|FWUbC;*x9$mYl_lrm;&yYBKf@*etQF>|l5HN> zRrmDxaV@@Pkpao~JqPN$Wh+Lnk1kH~A@BskbZOzxYXHZFV_N-lwM`bhZHxf0GlA z9Eatuj_d&N4LIN=JZ>038830(1oE2+$FrBS1%hjsP72Is$YA=m^jepd&y>fQ|qi0XhP7 q1n3CR5uhVLM}Uq19RWH5bOh)K&=H^`Ku3U%038830(1m05%@3p! 
None: + # Create a libp2p-node instance + listen_addrs = get_available_interfaces(0) + node = Node(listen_addrs=listen_addrs) + + async with ( + node.host.run(listen_addrs=listen_addrs), + trio.open_nursery() as nursery, + ): + # Start metrics service + metrics = Metrics() + + nursery.start_soon( + metrics.start_prometheus_server, node.host.metric_recv_channel + ) + nursery.start_soon(node.host.get_peerstore().start_cleanup_task, 60) + nursery.start_soon(node.command_executor, nursery) + + print(f"Host multiaddr: {node.host.get_addrs()[0]}") + await trio.sleep(1) + + print("Entering intractive mode, type commands below.") + promt_session = PromptSession() + print(COMMANDS) + + while not node.termination_event.is_set(): + try: + _ = await trio.to_thread.run_sync(input) + user_input = await trio.to_thread.run_sync( + lambda: promt_session.prompt("Command> ") + ) + cmds = user_input.strip().split(" ", 2) + await node.input_send_channel.send(cmds) + + except Exception as e: + print(f"Error in the interactive shell: {e}") + await trio.sleep(1) + + print("Shutdown complete, Goodbye!") + + +def cli() -> None: + try: + trio.run(main) + except* KeyboardInterrupt: + print("Session terminated by user") + + +if __name__ == "__main__": + cli() diff --git a/libp2p/__init__.py b/libp2p/__init__.py index 01d863257..f059c27ee 100644 --- a/libp2p/__init__.py +++ b/libp2p/__init__.py @@ -5,6 +5,8 @@ import logging from pathlib import Path import ssl + +import trio from libp2p.transport.quic.utils import is_quic_multiaddr from typing import Any from cryptography.hazmat.primitives.asymmetric import ed25519 @@ -293,7 +295,8 @@ def new_swarm( tls_client_config: ssl.SSLContext | None = None, tls_server_config: ssl.SSLContext | None = None, resource_manager: ResourceManager | None = None, - psk: str | None = None + psk: str | None = None, + metric_send_channel: trio.MemorySendChannel | None = None ) -> INetworkService: logger.debug(f"new_swarm: enable_quic={enable_quic}, 
listen_addrs={listen_addrs}") """ @@ -438,7 +441,8 @@ def new_swarm( transport, retry_config=retry_config, connection_config=connection_config, - psk=psk + psk=psk, + metric_send_channel=metric_send_channel ) # Set resource manager if provided @@ -468,6 +472,7 @@ def new_host( enable_mDNS: bool = False, enable_upnp: bool = False, enable_autotls: bool = False, + enable_metrics: bool = False, bootstrap: list[str] | None = None, negotiate_timeout: int = DEFAULT_NEGOTIATE_TIMEOUT, enable_quic: bool = False, @@ -511,6 +516,11 @@ def new_host( if not enable_quic and quic_transport_opt is not None: logger.warning(f"QUIC config provided but QUIC not enabled, ignoring QUIC config") + # Metric emit/consume endpoints + metric_send_channel, metric_recv_channel = None, None + if enable_metrics: + metric_send_channel, metric_recv_channel = trio.open_memory_channel(100) + # Enable automatic protection by default: if no resource manager is supplied, # create a default instance so connections/streams are guarded out of the box. 
if resource_manager is None: @@ -543,7 +553,8 @@ def new_host( tls_client_config=tls_client_config, tls_server_config=tls_server_config, resource_manager=resource_manager, - psk=psk + psk=psk, + metric_send_channel=metric_send_channel ) if disc_opt is not None: @@ -565,6 +576,7 @@ def new_host( enable_upnp=enable_upnp, negotiate_timeout=negotiate_timeout, resource_manager=resource_manager, + metric_recv_channel=metric_recv_channel bootstrap_allow_ipv6=bootstrap_allow_ipv6, bootstrap_dns_timeout=bootstrap_dns_timeout, bootstrap_dns_max_retries=bootstrap_dns_max_retries, diff --git a/libp2p/abc.py b/libp2p/abc.py index 101d2be35..67181b17e 100644 --- a/libp2p/abc.py +++ b/libp2p/abc.py @@ -327,6 +327,7 @@ class INetStream(ReadWriteCloser): """ muxed_conn: IMuxedConn + metric_send_channel: trio.MemorySendChannel | None @abstractmethod def get_protocol(self) -> TProtocol | None: @@ -2207,6 +2208,10 @@ async def upgrade_inbound_connection( """ + @abstractmethod + async def next_event(self) -> None: + """""" + # -------------------------- peer-record interface.py -------------------------- class IPeerRecord(ABC): diff --git a/libp2p/host/basic_host.py b/libp2p/host/basic_host.py index 95f93050b..012a89d0c 100644 --- a/libp2p/host/basic_host.py +++ b/libp2p/host/basic_host.py @@ -183,12 +183,12 @@ def __init__( network: INetworkService, enable_mDNS: bool = False, enable_upnp: bool = False, - enable_autotls: bool = False, bootstrap: list[str] | None = None, default_protocols: OrderedDict[TProtocol, StreamHandlerFn] | None = None, negotiate_timeout: int = DEFAULT_NEGOTIATE_TIMEOUT, resource_manager: ResourceManager | None = None, psk: str | None = None, + metric_recv_channel: trio.MemoryReceiveChannel | None = None, *, bootstrap_allow_ipv6: bool = False, bootstrap_dns_timeout: float = 10.0, @@ -274,6 +274,9 @@ def __init__( self._identified_peers: set[ID] = set() self._network.register_notifee(_IdentifyNotifee(self)) + # Metrics + self.metric_recv_channel = 
metric_recv_channel + def get_id(self) -> ID: """ :return: peer_id of host @@ -899,6 +902,10 @@ async def connect(self, peer_info: PeerInfo) -> None: # Kick off identify in the background so protocol caching can engage. self._schedule_identify(peer_info.peer_id, reason="connect") + async def next_event(self): + event = await self.metric_recv_channel.receive() + return event + async def _run_identify(self, peer_id: ID) -> None: """ Run identify protocol with a peer to discover supported protocols. diff --git a/libp2p/host/ping.py b/libp2p/host/ping.py index 2c95d5473..eb45ab540 100644 --- a/libp2p/host/ping.py +++ b/libp2p/host/ping.py @@ -25,6 +25,19 @@ logger = logging.getLogger(__name__) +class PingEvent: + peer_id: PeerID + rtts: list[int] | None + failure_error: Exception | None + + def __init__( + self, peer_id: PeerID, rtts: list[int] | None, failure_error: Exception | None + ): + self.peer_id = peer_id + self.rtts = rtts + self.failure_error = failure_error + + async def _handle_ping(stream: INetStream, peer_id: PeerID) -> bool: """ Return a boolean indicating if we expect more pings from the peer at ``peer_id``. 
@@ -81,13 +94,18 @@ async def _ping(stream: INetStream) -> int: returns integer value rtt - which denotes round trip time for a ping request in ms """ ping_bytes = secrets.token_bytes(PING_LENGTH) - before = time.time() + + start = time.time() await stream.write(ping_bytes) pong_bytes = await stream.read(PING_LENGTH) - rtt = int((time.time() - before) * (10**6)) + end = time.time() + + rtt = int((end - start) * (10**6)) # in microseconds + if ping_bytes != pong_bytes: logger.debug("invalid pong response") raise + return rtt @@ -103,7 +121,20 @@ async def ping(self, peer_id: PeerID, ping_amt: int = 1) -> list[int]: try: rtts = [await _ping(stream) for _ in range(ping_amt)] await stream.close() + + event = PingEvent( + peer_id=peer_id, + rtts=rtts, + failure_error=None, + ) + return rtts - except Exception: + + except Exception as error: await stream.close() + + event = PingEvent(peer_id=peer_id, rtts=None, failure_error=error) raise + + finally: + await stream.metric_send_channel.send(event) diff --git a/libp2p/metrics/_init__.py b/libp2p/metrics/_init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/libp2p/metrics/metrics.py b/libp2p/metrics/metrics.py new file mode 100644 index 000000000..000156406 --- /dev/null +++ b/libp2p/metrics/metrics.py @@ -0,0 +1,29 @@ +from prometheus_client import start_http_server +import trio +from libp2p.host.ping import PingEvent +from libp2p.metrics.ping import PingMetrics +from libp2p.utils.address_validation import find_free_port + + +class Metrics: + ping: PingMetrics + + def __init__(self): + self.ping = PingMetrics() + + async def start_prometheus_server( + self, + metric_recv_channel: trio.MemoryReceiveChannel, + ) -> None: + + free_port = find_free_port() + start_http_server(free_port) + + print(f"Prometheus server started: http://localhost:{free_port}") + + while True: + event = await metric_recv_channel.receive() + + match event: + case PingEvent(): + self.ping.record(event) \ No newline at end of file diff 
--git a/libp2p/metrics/ping.py b/libp2p/metrics/ping.py new file mode 100644 index 000000000..5e4652f30 --- /dev/null +++ b/libp2p/metrics/ping.py @@ -0,0 +1,37 @@ +from prometheus_client import Counter, Histogram + +from libp2p.host.ping import PingEvent + + +class PingMetrics: + rtt: Histogram + failures: Counter + + def __init__(self): + rtt = Histogram( + "ping", + "round-trip time sending a 'ping' and receiving a 'pong'", + buckets=[400, 500, 600, 700, 800], + ) + + failures = Counter( + "ping_failure", + "FAilure while sending a ping or receiving a ping", + labelnames=["reason", "peer_id"], + ) + + self.rtt = rtt + self.failures = failures + + def record(self, event: PingEvent) -> None: + match event: + case PingEvent(peer_id=_, rtts=list() as rtts, failure_error=None): + print(rtts) + for rtt_us in rtts: + self.rtt.observe(rtt_us) + + case PingEvent(peer_id=_, rtts=None, failure_error=err): + self.failures.labels(reason=type(err).__name__).inc() + + case _: + raise ValueError("Invalid PingEvent state") diff --git a/libp2p/network/connection/swarm_connection.py b/libp2p/network/connection/swarm_connection.py index 1bc1b154b..77e035e0b 100644 --- a/libp2p/network/connection/swarm_connection.py +++ b/libp2p/network/connection/swarm_connection.py @@ -42,6 +42,7 @@ class SwarmConn(INetConn): _direction: Direction _actual_transport_addresses: list[Multiaddr] | None _connection_type: ConnectionType + _metric_send_channel: trio.MemorySendChannel | None = None def __init__( self, @@ -268,8 +269,7 @@ async def _handle_muxed_stream(self, muxed_stream: IMuxedStream) -> None: await self.swarm.notify_closed_stream(net_stream) async def _add_stream(self, muxed_stream: IMuxedStream) -> NetStream: - # - net_stream = NetStream(muxed_stream, self) + net_stream = NetStream(muxed_stream, self, self._metric_send_channel) # Set Stream state to OPEN if the event has already started. 
# This is to ensure that the new streams created after connection has started # are immediately set to OPEN state. diff --git a/libp2p/network/stream/net_stream.py b/libp2p/network/stream/net_stream.py index 3366dad54..52501b38e 100644 --- a/libp2p/network/stream/net_stream.py +++ b/libp2p/network/stream/net_stream.py @@ -122,9 +122,13 @@ class NetStream(INetStream): muxed_stream: IMuxedStream protocol_id: TProtocol | None + metric_send_channel: trio.MemorySendChannel | None def __init__( - self, muxed_stream: IMuxedStream, swarm_conn: "SwarmConn | None" + self, + muxed_stream: IMuxedStream, + swarm_conn: "SwarmConn | None", + metric_send_channel: trio.MemorySendChannel | None, ) -> None: self.muxed_stream = muxed_stream self.muxed_conn = muxed_stream.muxed_conn @@ -141,6 +145,9 @@ def __init__( # Thread safety for state operations (following AkMo3's approach) self._state_lock = trio.Lock() + # Metrics emit endpoint + self.metric_send_channel = metric_send_channel + def get_protocol(self) -> TProtocol | None: """ :return: protocol id that stream runs on diff --git a/libp2p/network/swarm.py b/libp2p/network/swarm.py index 69a8e8ad3..185626ee4 100644 --- a/libp2p/network/swarm.py +++ b/libp2p/network/swarm.py @@ -125,6 +125,7 @@ def __init__( retry_config: RetryConfig | None = None, connection_config: ConnectionConfig | QUICTransportConfig | None = None, psk: str | None = None, + metric_send_channel: trio.MemorySendChannel | None = None, ): self.self_id = peer_id self.peerstore = peerstore @@ -153,6 +154,9 @@ def __init__( self._resource_manager = None self._stream_semaphore: trio.Semaphore | None = None + # Metrics + self.metric_send_channel = metric_send_channel + # Initialize connection management components self._init_connection_management() @@ -826,6 +830,9 @@ async def upgrade_outbound_raw_conn( pass swarm_conn = await self.add_conn(muxed_conn, direction="outbound") + + swarm_conn._metric_send_channel = self.metric_send_channel + logger.debug("successfully 
dialed peer %s", peer_id) return swarm_conn diff --git a/pyproject.toml b/pyproject.toml index cd7aa5c68..995b7b5a0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,6 +42,7 @@ dependencies = [ "py-cid>=0.5.0", "pynacl>=1.3.0", "rpcudp>=3.0.0", + "prometheus-client>=0.24.1", "trio-typing>=0.0.4", "trio-websocket>=0.11.0", "trio>=0.26.0", @@ -77,6 +78,7 @@ circuit-relay-demo = "examples.circuit_relay.relay_example:main" tls-demo = "examples.tls.example_tls_server:main" tls-client-demo = "examples.tls.example_tls_client:main" path-handling = "examples.path_handling_demo:main" +metrics-demo = "examples.metrics.runner:cli" oso-health-report = "libp2p.observability.oso.cli:main" [dependency-groups] From f33b836dccd5985d07680e9593fdfe1b03de0dd2 Mon Sep 17 00:00:00 2001 From: Abhinav Agarwalla Date: Thu, 12 Feb 2026 13:59:59 +0530 Subject: [PATCH 02/16] feat: Attached promtheus/grafana services with docker --- .gitignore | 1 + examples/metrics/coordinator.py | 2 -- libp2p/metrics/docker-compose.yml | 18 ++++++++++++++ libp2p/metrics/metrics.py | 41 +++++++++++++++++++++++++------ libp2p/metrics/prometheus.yml | 8 ++++++ libp2p/utils/paths.py | 2 ++ 6 files changed, 62 insertions(+), 10 deletions(-) create mode 100644 libp2p/metrics/docker-compose.yml create mode 100644 libp2p/metrics/prometheus.yml diff --git a/.gitignore b/.gitignore index 525f5696d..592c540b7 100644 --- a/.gitignore +++ b/.gitignore @@ -196,6 +196,7 @@ _build/ # Attack simulation test results tests/security/attack_simulation/results/ libp2p-forge +libp2p-metrics # OSO health report generated outputs reports/*.json diff --git a/examples/metrics/coordinator.py b/examples/metrics/coordinator.py index bfe829fd8..45437e068 100644 --- a/examples/metrics/coordinator.py +++ b/examples/metrics/coordinator.py @@ -59,8 +59,6 @@ async def command_executor(self, nursery): await self.host.connect(info) await self.ping_service.ping(info.peer_id, int(parts[2])) - # Then the rtts will be fed to the 
prometheus-metrics - if cmd == "local": maddr = self.host.get_addrs()[0] print(maddr) diff --git a/libp2p/metrics/docker-compose.yml b/libp2p/metrics/docker-compose.yml new file mode 100644 index 000000000..f8e360206 --- /dev/null +++ b/libp2p/metrics/docker-compose.yml @@ -0,0 +1,18 @@ +services: + prometheus: + image: prom/prometheus:latest + container_name: prometheus + volumes: + - ./prometheus.yml:/etc/prometheus/prometheus.yml:ro + ports: + - "${PROMETHEUS_PORT}:9090" + extra_hosts: + - "host.docker.internal:host-gateway" + + grafana: + image: grafana/grafana:latest + container_name: grafana + ports: + - "${GRAFANA_PORT}:3000" + depends_on: + - prometheus \ No newline at end of file diff --git a/libp2p/metrics/metrics.py b/libp2p/metrics/metrics.py index 000156406..bc63c542d 100644 --- a/libp2p/metrics/metrics.py +++ b/libp2p/metrics/metrics.py @@ -1,8 +1,22 @@ +import socket + from prometheus_client import start_http_server import trio + from libp2p.host.ping import PingEvent from libp2p.metrics.ping import PingMetrics -from libp2p.utils.address_validation import find_free_port + + +def find_available_port(start_port: int = 8000, host: str = "127.0.0.1") -> int: + port = start_port + + while True: + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock: + try: + sock.bind((host, port)) + return port + except OSError: + port += 1 class Metrics: @@ -15,15 +29,26 @@ async def start_prometheus_server( self, metric_recv_channel: trio.MemoryReceiveChannel, ) -> None: - - free_port = find_free_port() - start_http_server(free_port) - - print(f"Prometheus server started: http://localhost:{free_port}") + metrics = find_available_port(8000) + prometheus_dashboard = find_available_port(9000) + grafana_dashboard = find_available_port(7000) + + start_http_server(metrics) + + print(f"\nPrometheus metrics visible at: http://localhost:{metrics}") + print( + f"Prometheus dashboard visible at: http://localhost:{prometheus_dashboard}" + ) + print(f"Grafana dashboard 
visible at: http://localhost:{grafana_dashboard}\n") + + print( + "\nStart prometheus and grafana dashboard, for another terminal: \n" + f"PROMETHEUS_PORT={prometheus_dashboard} GRAFANA_PORT={grafana_dashboard} docker compose up\n" + ) while True: event = await metric_recv_channel.receive() - + match event: case PingEvent(): - self.ping.record(event) \ No newline at end of file + self.ping.record(event) diff --git a/libp2p/metrics/prometheus.yml b/libp2p/metrics/prometheus.yml new file mode 100644 index 000000000..8db336dc5 --- /dev/null +++ b/libp2p/metrics/prometheus.yml @@ -0,0 +1,8 @@ +global: + scrape_interval: 5s + +scrape_configs: + - job_name: "libp2p-python" + static_configs: + - targets: + - "host.docker.internal:8000" \ No newline at end of file diff --git a/libp2p/utils/paths.py b/libp2p/utils/paths.py index 45b5a9cfe..2ab5c8133 100644 --- a/libp2p/utils/paths.py +++ b/libp2p/utils/paths.py @@ -29,6 +29,8 @@ AUTOTLS_CERT_PATH = Path("libp2p-forge/peer1/autotls-cert.pem") AUTOTLS_KEY_PATH = Path("libp2p-forge/peer1/autotls-key.pem") +METRICS_CONFIG_PATH = Path("libp2p-metrics/.config") + def get_temp_dir() -> Path: """ From a3aa040546adb89ee77ef27c9fc12d9fdefa7212 Mon Sep 17 00:00:00 2001 From: Abhinav Agarwalla Date: Fri, 27 Feb 2026 14:41:09 +0530 Subject: [PATCH 03/16] feat: gossipsub metrics infra --- libp2p/metrics/gossipsub.py | 54 +++++++++++++++++++++++++++++++++++++ libp2p/metrics/kad_dht.py | 0 libp2p/metrics/swarm.py | 0 libp2p/pubsub/gossipsub.py | 11 ++++++++ 4 files changed, 65 insertions(+) create mode 100644 libp2p/metrics/gossipsub.py create mode 100644 libp2p/metrics/kad_dht.py create mode 100644 libp2p/metrics/swarm.py diff --git a/libp2p/metrics/gossipsub.py b/libp2p/metrics/gossipsub.py new file mode 100644 index 000000000..369089019 --- /dev/null +++ b/libp2p/metrics/gossipsub.py @@ -0,0 +1,54 @@ +from prometheus_client import Counter, Histogram + +from libp2p.pubsub.gossipsub import GossipsubEvent + + +class GossipsubMetrics: + 
delivered: Counter + dropped: Counter + validated_fail: Counter + msg_size: Histogram + + def __init__(self): + self.delivered = Counter( + "gossipsub_delivered_total", + "Messages successfully delivered", + labelnames=["topic"], + ) + + self.dropped = Counter( + "gossipsub_dropped_total", + "Messages dropped", + labelnames=["topic", "reason"], + ) + + self.validated_fail = Counter( + "gossipsub_validation_failed_total", + "Messages rejected by validator", + labelnames=["topic", "error"], + ) + + self.msg_size = Histogram( + "gossipsub_message_bytes", + "Message size in bytes", + buckets=[64, 128, 256, 512, 1024, 2048, 4096], + ) + + def record(self, event: GossipsubEvent) -> None: + if event.delivered: + self.delivered.labels(topic=event.topic).inc() + + if event.message_size is not None: + self.msg_size.observe(event.message_size) + + if event.dropped_reason: + self.dropped.labels( + topic=event.topic, + reason=event.dropped_reason, + ).inc() + + if event.validation_error: + self.validated_fail.labels( + topic=event.topic, + error=type(event.validation_error).__name__, + ).inc() \ No newline at end of file diff --git a/libp2p/metrics/kad_dht.py b/libp2p/metrics/kad_dht.py new file mode 100644 index 000000000..e69de29bb diff --git a/libp2p/metrics/swarm.py b/libp2p/metrics/swarm.py new file mode 100644 index 000000000..e69de29bb diff --git a/libp2p/pubsub/gossipsub.py b/libp2p/pubsub/gossipsub.py index 2f3bfeabc..9d2bbf0cd 100644 --- a/libp2p/pubsub/gossipsub.py +++ b/libp2p/pubsub/gossipsub.py @@ -14,6 +14,7 @@ from typing import ( Any, DefaultDict, + Optional, ) import trio @@ -78,6 +79,16 @@ _MAX_PENDING_GRAFT_PRUNE_PER_PEER = 64 +class GossipsubEvent: + peer_id: str + topic: str + + # one of these should be set + message_size: Optional[int] = None + delivered: bool = False + dropped_reason: Optional[str] = None + validation_error: Optional[Exception] = None + class GossipSub(IPubsubRouter, Service): protocols: list[TProtocol] From 
ee85841b6d7c69294dcb7588d19d09a39ee72a45 Mon Sep 17 00:00:00 2001 From: Abhinav Agarwalla Date: Fri, 27 Feb 2026 15:00:05 +0530 Subject: [PATCH 04/16] feat: dcutr metrics infra --- libp2p/metrics/dcutr.py | 26 ++++++++++++++++++++++++++ libp2p/metrics/metrics.py | 9 +++++++++ 2 files changed, 35 insertions(+) create mode 100644 libp2p/metrics/dcutr.py diff --git a/libp2p/metrics/dcutr.py b/libp2p/metrics/dcutr.py new file mode 100644 index 000000000..c57aa7340 --- /dev/null +++ b/libp2p/metrics/dcutr.py @@ -0,0 +1,26 @@ +from typing import Optional +from prometheus_client import Counter + +class DcutrEvent: + peer_id: str + success: bool + error: Optional[Exception] = None + + +class DcutrMetrics: + events: Counter + + def __init__(self): + self.events = Counter( + "dcutr_events_total", + "Events emitted by the DCUtR behaviour", + labelnames=["event"], + ) + + def record(self, event: DcutrEvent) -> None: + if event.success: + label = "direct_connection_upgrade_succeeded" + else: + label = "direct_connection_upgrade_failed" + + self.events.labels(event=label).inc() \ No newline at end of file diff --git a/libp2p/metrics/metrics.py b/libp2p/metrics/metrics.py index bc63c542d..7e093c229 100644 --- a/libp2p/metrics/metrics.py +++ b/libp2p/metrics/metrics.py @@ -4,7 +4,10 @@ import trio from libp2p.host.ping import PingEvent +from libp2p.metrics.dcutr import DcutrEvent, DcutrMetrics +from libp2p.metrics.gossipsub import GossipsubMetrics from libp2p.metrics.ping import PingMetrics +from libp2p.pubsub.gossipsub import GossipsubEvent def find_available_port(start_port: int = 8000, host: str = "127.0.0.1") -> int: @@ -24,6 +27,8 @@ class Metrics: def __init__(self): self.ping = PingMetrics() + self.gossipsub = GossipsubMetrics() + self.dcutr = DcutrMetrics() async def start_prometheus_server( self, @@ -52,3 +57,7 @@ async def start_prometheus_server( match event: case PingEvent(): self.ping.record(event) + case GossipsubEvent(): + self.gossipsub.record(event) + case 
DcutrEvent(): + self.dcutr.record(event) From fb01431ccd60a3b9adf8d6aadc5369f5b30aae02 Mon Sep 17 00:00:00 2001 From: Abhinav Agarwalla Date: Fri, 27 Feb 2026 15:00:56 +0530 Subject: [PATCH 05/16] chore: fix formatting --- libp2p/metrics/dcutr.py | 8 ++++---- libp2p/metrics/docker-compose.yml | 2 +- libp2p/metrics/gossipsub.py | 2 +- libp2p/metrics/prometheus.yml | 2 +- libp2p/pubsub/gossipsub.py | 7 +++---- 5 files changed, 10 insertions(+), 11 deletions(-) diff --git a/libp2p/metrics/dcutr.py b/libp2p/metrics/dcutr.py index c57aa7340..85d27c227 100644 --- a/libp2p/metrics/dcutr.py +++ b/libp2p/metrics/dcutr.py @@ -1,11 +1,11 @@ -from typing import Optional from prometheus_client import Counter + class DcutrEvent: peer_id: str success: bool - error: Optional[Exception] = None - + error: Exception | None = None + class DcutrMetrics: events: Counter @@ -23,4 +23,4 @@ def record(self, event: DcutrEvent) -> None: else: label = "direct_connection_upgrade_failed" - self.events.labels(event=label).inc() \ No newline at end of file + self.events.labels(event=label).inc() diff --git a/libp2p/metrics/docker-compose.yml b/libp2p/metrics/docker-compose.yml index f8e360206..b716126d0 100644 --- a/libp2p/metrics/docker-compose.yml +++ b/libp2p/metrics/docker-compose.yml @@ -15,4 +15,4 @@ services: ports: - "${GRAFANA_PORT}:3000" depends_on: - - prometheus \ No newline at end of file + - prometheus diff --git a/libp2p/metrics/gossipsub.py b/libp2p/metrics/gossipsub.py index 369089019..9daf56557 100644 --- a/libp2p/metrics/gossipsub.py +++ b/libp2p/metrics/gossipsub.py @@ -51,4 +51,4 @@ def record(self, event: GossipsubEvent) -> None: self.validated_fail.labels( topic=event.topic, error=type(event.validation_error).__name__, - ).inc() \ No newline at end of file + ).inc() diff --git a/libp2p/metrics/prometheus.yml b/libp2p/metrics/prometheus.yml index 8db336dc5..524c74dfc 100644 --- a/libp2p/metrics/prometheus.yml +++ b/libp2p/metrics/prometheus.yml @@ -5,4 +5,4 @@ 
scrape_configs: - job_name: "libp2p-python" static_configs: - targets: - - "host.docker.internal:8000" \ No newline at end of file + - "host.docker.internal:8000" diff --git a/libp2p/pubsub/gossipsub.py b/libp2p/pubsub/gossipsub.py index 9d2bbf0cd..46af9fad0 100644 --- a/libp2p/pubsub/gossipsub.py +++ b/libp2p/pubsub/gossipsub.py @@ -14,7 +14,6 @@ from typing import ( Any, DefaultDict, - Optional, ) import trio @@ -84,10 +83,10 @@ class GossipsubEvent: topic: str # one of these should be set - message_size: Optional[int] = None + message_size: int | None = None delivered: bool = False - dropped_reason: Optional[str] = None - validation_error: Optional[Exception] = None + dropped_reason: str | None = None + validation_error: Exception | None = None class GossipSub(IPubsubRouter, Service): From d42591847a2271fa83a76b868063f3bc3426f900 Mon Sep 17 00:00:00 2001 From: Abhinav Agarwalla Date: Wed, 11 Mar 2026 13:49:01 +0530 Subject: [PATCH 06/16] feat: relay and kad-dht metrics --- libp2p/metrics/kad.py | 171 ++++++++++++++++++++++++++++++++++++++++ libp2p/metrics/relay.py | 62 +++++++++++++++ 2 files changed, 233 insertions(+) create mode 100644 libp2p/metrics/kad.py create mode 100644 libp2p/metrics/relay.py diff --git a/libp2p/metrics/kad.py b/libp2p/metrics/kad.py new file mode 100644 index 000000000..1496d4c18 --- /dev/null +++ b/libp2p/metrics/kad.py @@ -0,0 +1,171 @@ +from prometheus_client import Counter, Histogram + + +class KadMetrics: + """ + Prometheus metrics for the Kademlia behaviour. + Mirrors the Rust libp2p metrics design. 
+ """ + + def __init__(self): + + # ------------------------- + # GetRecord metrics + # ------------------------- + + self.query_result_get_record_ok = Counter( + "kad_query_result_get_record_ok_total", + "Number of records returned by a successful Kademlia get record query", + ) + + self.query_result_get_record_error = Counter( + "kad_query_result_get_record_error_total", + "Number of failed Kademlia get record queries", + labelnames=["error"], + ) + + # ------------------------- + # GetClosestPeers metrics + # ------------------------- + + self.query_result_get_closest_peers_ok = Histogram( + "kad_query_result_get_closest_peers_ok", + "Number of closest peers returned by a successful query", + buckets=(1,2,4,8,16,32,64,128,256,512), + ) + + self.query_result_get_closest_peers_error = Counter( + "kad_query_result_get_closest_peers_error_total", + "Number of failed get closest peers queries", + labelnames=["error"], + ) + + # ------------------------- + # GetProviders metrics + # ------------------------- + + self.query_result_get_providers_ok = Histogram( + "kad_query_result_get_providers_ok", + "Number of providers returned by a successful query", + buckets=(1,2,4,8,16,32,64,128,256,512), + ) + + self.query_result_get_providers_error = Counter( + "kad_query_result_get_providers_error_total", + "Number of failed get providers queries", + labelnames=["error"], + ) + + # ------------------------- + # Query statistics + # ------------------------- + + self.query_result_num_requests = Histogram( + "kad_query_result_num_requests", + "Number of requests started for a Kademlia query", + labelnames=["type"], + buckets=(1,2,4,8,16,32,64,128,256,512), + ) + + self.query_result_num_success = Histogram( + "kad_query_result_num_success", + "Number of successful requests of a Kademlia query", + labelnames=["type"], + buckets=(1,2,4,8,16,32,64,128,256,512), + ) + + self.query_result_num_failure = Histogram( + "kad_query_result_num_failure", + "Number of failed requests of a 
Kademlia query", + labelnames=["type"], + buckets=(1,2,4,8,16,32,64,128,256,512), + ) + + self.query_result_duration = Histogram( + "kad_query_result_duration_seconds", + "Duration of a Kademlia query", + labelnames=["type"], + buckets=(0.1,0.2,0.4,0.8,1.6,3.2,6.4,12.8,25.6), + ) + + # ------------------------- + # Routing table updates + # ------------------------- + + self.routing_updated = Counter( + "kad_routing_updated_total", + "Peers added, updated, or evicted from routing table", + labelnames=["action", "bucket"], + ) + + # ------------------------- + # inbound requests + # ------------------------- + + self.inbound_requests = Counter( + "kad_inbound_requests_total", + "Number of inbound requests", + labelnames=["request"], + ) + + # ----------------------------------------------------- + + def record_outbound_query(self, query_type, stats): + + self.query_result_num_requests.labels(type=query_type).observe( + stats["num_requests"] + ) + + self.query_result_num_success.labels(type=query_type).observe( + stats["num_success"] + ) + + self.query_result_num_failure.labels(type=query_type).observe( + stats["num_failures"] + ) + + if stats.get("duration") is not None: + self.query_result_duration.labels(type=query_type).observe( + stats["duration"] + ) + + # ----------------------------------------------------- + + def record_get_record_ok(self): + self.query_result_get_record_ok.inc() + + def record_get_record_error(self, error): + self.query_result_get_record_error.labels(error=error).inc() + + # ----------------------------------------------------- + + def record_get_closest_peers_ok(self, peer_count): + self.query_result_get_closest_peers_ok.observe(peer_count) + + def record_get_closest_peers_error(self, error): + self.query_result_get_closest_peers_error.labels(error=error).inc() + + # ----------------------------------------------------- + + def record_get_providers_ok(self, provider_count): + self.query_result_get_providers_ok.observe(provider_count) + + 
def record_get_providers_error(self, error): + self.query_result_get_providers_error.labels(error=error).inc() + + # ----------------------------------------------------- + + def record_routing_update(self, action, bucket): + + self.routing_updated.labels( + action=action, + bucket=str(bucket), + ).inc() + + # ----------------------------------------------------- + + def record_inbound_request(self, request_type): + + self.inbound_requests.labels( + request=request_type + ).inc() \ No newline at end of file diff --git a/libp2p/metrics/relay.py b/libp2p/metrics/relay.py new file mode 100644 index 000000000..a3895241f --- /dev/null +++ b/libp2p/metrics/relay.py @@ -0,0 +1,62 @@ +from dataclasses import dataclass +from prometheus_client import Counter + + +@dataclass(slots=True) +class RelayEvent: + """ + Event emitted by the relay behaviour. + + Only the event type is required because the metrics layer + simply counts occurrences of each event type. + """ + event_type: str + + +class RelayEventType: + """ + Equivalent of the Rust `EventType` enum. + """ + + RESERVATION_REQ_ACCEPTED = "ReservationReqAccepted" + RESERVATION_REQ_ACCEPT_FAILED = "ReservationReqAcceptFailed" + RESERVATION_REQ_DENIED = "ReservationReqDenied" + RESERVATION_REQ_DENY_FAILED = "ReservationReqDenyFailed" + RESERVATION_CLOSED = "ReservationClosed" + RESERVATION_TIMED_OUT = "ReservationTimedOut" + + CIRCUIT_REQ_DENIED = "CircuitReqDenied" + CIRCUIT_REQ_DENY_FAILED = "CircuitReqDenyFailed" + CIRCUIT_REQ_OUTBOUND_CONNECT_FAILED = "CircuitReqOutboundConnectFailed" + + CIRCUIT_REQ_ACCEPTED = "CircuitReqAccepted" + CIRCUIT_REQ_ACCEPT_FAILED = "CircuitReqAcceptFailed" + + CIRCUIT_CLOSED = "CircuitClosed" + + +class RelayMetrics: + """ + Prometheus metrics for relay behaviour. + + Equivalent to the Rust implementation: + + Family + + which becomes a Counter with labels in the Python Prometheus client. 
+ """ + + events: Counter + + def __init__(self) -> None: + self.events = Counter( + "relay_events_total", + "Events emitted by the relay NetworkBehaviour", + labelnames=["event"], + ) + + def record(self, event: RelayEvent) -> None: + """ + Record a relay event. + """ + self.events.labels(event=event.event_type).inc() \ No newline at end of file From a2a3d969b57f326061e5aa70cbab80970a29bcd2 Mon Sep 17 00:00:00 2001 From: Abhinav Agarwalla Date: Wed, 11 Mar 2026 13:49:40 +0530 Subject: [PATCH 07/16] chore: fix formatting --- libp2p/metrics/kad.py | 20 +++++++------------- libp2p/metrics/relay.py | 4 +++- 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/libp2p/metrics/kad.py b/libp2p/metrics/kad.py index 1496d4c18..b58ab8504 100644 --- a/libp2p/metrics/kad.py +++ b/libp2p/metrics/kad.py @@ -8,7 +8,6 @@ class KadMetrics: """ def __init__(self): - # ------------------------- # GetRecord metrics # ------------------------- @@ -31,7 +30,7 @@ def __init__(self): self.query_result_get_closest_peers_ok = Histogram( "kad_query_result_get_closest_peers_ok", "Number of closest peers returned by a successful query", - buckets=(1,2,4,8,16,32,64,128,256,512), + buckets=(1, 2, 4, 8, 16, 32, 64, 128, 256, 512), ) self.query_result_get_closest_peers_error = Counter( @@ -47,7 +46,7 @@ def __init__(self): self.query_result_get_providers_ok = Histogram( "kad_query_result_get_providers_ok", "Number of providers returned by a successful query", - buckets=(1,2,4,8,16,32,64,128,256,512), + buckets=(1, 2, 4, 8, 16, 32, 64, 128, 256, 512), ) self.query_result_get_providers_error = Counter( @@ -64,28 +63,28 @@ def __init__(self): "kad_query_result_num_requests", "Number of requests started for a Kademlia query", labelnames=["type"], - buckets=(1,2,4,8,16,32,64,128,256,512), + buckets=(1, 2, 4, 8, 16, 32, 64, 128, 256, 512), ) self.query_result_num_success = Histogram( "kad_query_result_num_success", "Number of successful requests of a Kademlia query", labelnames=["type"], - 
buckets=(1,2,4,8,16,32,64,128,256,512), + buckets=(1, 2, 4, 8, 16, 32, 64, 128, 256, 512), ) self.query_result_num_failure = Histogram( "kad_query_result_num_failure", "Number of failed requests of a Kademlia query", labelnames=["type"], - buckets=(1,2,4,8,16,32,64,128,256,512), + buckets=(1, 2, 4, 8, 16, 32, 64, 128, 256, 512), ) self.query_result_duration = Histogram( "kad_query_result_duration_seconds", "Duration of a Kademlia query", labelnames=["type"], - buckets=(0.1,0.2,0.4,0.8,1.6,3.2,6.4,12.8,25.6), + buckets=(0.1, 0.2, 0.4, 0.8, 1.6, 3.2, 6.4, 12.8, 25.6), ) # ------------------------- @@ -111,7 +110,6 @@ def __init__(self): # ----------------------------------------------------- def record_outbound_query(self, query_type, stats): - self.query_result_num_requests.labels(type=query_type).observe( stats["num_requests"] ) @@ -156,7 +154,6 @@ def record_get_providers_error(self, error): # ----------------------------------------------------- def record_routing_update(self, action, bucket): - self.routing_updated.labels( action=action, bucket=str(bucket), @@ -165,7 +162,4 @@ def record_routing_update(self, action, bucket): # ----------------------------------------------------- def record_inbound_request(self, request_type): - - self.inbound_requests.labels( - request=request_type - ).inc() \ No newline at end of file + self.inbound_requests.labels(request=request_type).inc() diff --git a/libp2p/metrics/relay.py b/libp2p/metrics/relay.py index a3895241f..464f57067 100644 --- a/libp2p/metrics/relay.py +++ b/libp2p/metrics/relay.py @@ -1,4 +1,5 @@ from dataclasses import dataclass + from prometheus_client import Counter @@ -10,6 +11,7 @@ class RelayEvent: Only the event type is required because the metrics layer simply counts occurrences of each event type. """ + event_type: str @@ -59,4 +61,4 @@ def record(self, event: RelayEvent) -> None: """ Record a relay event. 
""" - self.events.labels(event=event.event_type).inc() \ No newline at end of file + self.events.labels(event=event.event_type).inc() From dc743a9e7f7d88bc6611090140657af612af1d95 Mon Sep 17 00:00:00 2001 From: Abhinav Agarwalla Date: Wed, 11 Mar 2026 13:59:59 +0530 Subject: [PATCH 08/16] feat: swarm metrics and per-protocol inbound and outbound bandwidth usage in prometheus --- libp2p/metrics/bandwidth.py | 90 ++++++++++++++ libp2p/metrics/swarm.py | 238 ++++++++++++++++++++++++++++++++++++ 2 files changed, 328 insertions(+) create mode 100644 libp2p/metrics/bandwidth.py diff --git a/libp2p/metrics/bandwidth.py b/libp2p/metrics/bandwidth.py new file mode 100644 index 000000000..d24118020 --- /dev/null +++ b/libp2p/metrics/bandwidth.py @@ -0,0 +1,90 @@ +import asyncio +from prometheus_client import Counter + + +class BandwidthMetrics: + """ + Prometheus bandwidth metrics for libp2p transport streams. + """ + + def __init__(self): + + self.bandwidth = Counter( + "libp2p_bandwidth_bytes_total", + "Bandwidth usage by direction and protocol stack", + ["direction", "protocols"], + ) + + def outbound(self, protocols, n): + self.bandwidth.labels( + direction="outbound", + protocols=protocols, + ).inc(n) + + def inbound(self, protocols, n): + self.bandwidth.labels( + direction="inbound", + protocols=protocols, + ).inc(n) + + +class InstrumentedStream: + """ + Wraps a stream to measure bandwidth. 
+ """ + + def __init__(self, stream, metrics: BandwidthMetrics, protocols: str): + self.stream = stream + self.metrics = metrics + self.protocols = protocols + + async def read(self, n=-1): + data = await self.stream.read(n) + + if data: + self.metrics.inbound(self.protocols, len(data)) + + return data + + async def write(self, data: bytes): + n = await self.stream.write(data) + + if n is None: + n = len(data) + + self.metrics.outbound(self.protocols, n) + + return n + + async def close(self): + await self.stream.close() + + +class TransportWrapper: + """ + Wraps a transport and instruments bandwidth. + """ + + def __init__(self, transport, metrics: BandwidthMetrics): + self.transport = transport + self.metrics = metrics + + async def dial(self, addr, protocols): + + stream = await self.transport.dial(addr) + + return InstrumentedStream( + stream, + self.metrics, + protocols, + ) + + async def accept(self, protocols): + + stream = await self.transport.accept() + + return InstrumentedStream( + stream, + self.metrics, + protocols, + ) \ No newline at end of file diff --git a/libp2p/metrics/swarm.py b/libp2p/metrics/swarm.py index e69de29bb..0663bd35c 100644 --- a/libp2p/metrics/swarm.py +++ b/libp2p/metrics/swarm.py @@ -0,0 +1,238 @@ +import time +from prometheus_client import Counter, Histogram + + +class SwarmMetrics: + """ + Prometheus metrics for libp2p swarm events. + Mirrors the Rust libp2p metrics implementation. 
+ """ + + def __init__(self): + + # --------------------------- + # incoming connections + # --------------------------- + + self.connections_incoming = Counter( + "swarm_connections_incoming_total", + "Number of incoming connections per address stack", + ["protocols"], + ) + + self.connections_incoming_error = Counter( + "swarm_connections_incoming_error_total", + "Number of incoming connection errors", + ["error", "protocols"], + ) + + # --------------------------- + # connection lifecycle + # --------------------------- + + self.connections_established = Counter( + "swarm_connections_established_total", + "Number of connections established", + ["role", "protocols"], + ) + + self.connections_establishment_duration = Histogram( + "swarm_connections_establishment_duration_seconds", + "Time taken to establish connection", + ["role", "protocols"], + buckets=( + 0.01, 0.02, 0.05, 0.1, + 0.2, 0.5, 1, 2, 5, 10 + ), + ) + + self.connections_duration = Histogram( + "swarm_connections_duration_seconds", + "Time a connection was alive", + ["role", "protocols", "cause"], + buckets=( + 0.01, 0.1, 1, 5, 10, 30, 60, 300, 600 + ), + ) + + # --------------------------- + # listening addresses + # --------------------------- + + self.new_listen_addr = Counter( + "swarm_new_listen_addr_total", + "Number of new listen addresses", + ["protocols"], + ) + + self.expired_listen_addr = Counter( + "swarm_expired_listen_addr_total", + "Number of expired listen addresses", + ["protocols"], + ) + + # --------------------------- + # external addresses + # --------------------------- + + self.external_addr_candidates = Counter( + "swarm_external_addr_candidates_total", + "Number of new external address candidates", + ["protocols"], + ) + + self.external_addr_confirmed = Counter( + "swarm_external_addr_confirmed_total", + "Number of confirmed external addresses", + ["protocols"], + ) + + self.external_addr_expired = Counter( + "swarm_external_addr_expired_total", + "Number of expired external 
addresses", + ["protocols"], + ) + + # --------------------------- + # listener lifecycle + # --------------------------- + + self.listener_closed = Counter( + "swarm_listener_closed_total", + "Number of listeners closed", + ["protocols"], + ) + + self.listener_error = Counter( + "swarm_listener_error_total", + "Number of listener errors", + ) + + # --------------------------- + # dialing + # --------------------------- + + self.dial_attempt = Counter( + "swarm_dial_attempt_total", + "Number of dial attempts", + ) + + self.outgoing_connection_error = Counter( + "swarm_outgoing_connection_error_total", + "Outgoing connection errors", + ["peer", "error"], + ) + + # --------------------------- + # connection tracking + # --------------------------- + + self.connections = {} + + # ------------------------------------------------- + + def record(self, event): + """ + Record a SwarmEvent-like object. + """ + + etype = event["type"] + + if etype == "ConnectionEstablished": + + role = event["role"] + protocols = event["protocols"] + conn_id = event["connection_id"] + duration = event.get("established_in", 0) + + self.connections_established.labels( + role=role, + protocols=protocols, + ).inc() + + self.connections_establishment_duration.labels( + role=role, + protocols=protocols, + ).observe(duration) + + self.connections[conn_id] = time.time() + + elif etype == "ConnectionClosed": + + conn_id = event["connection_id"] + role = event["role"] + protocols = event["protocols"] + cause = event.get("cause", "None") + + if conn_id in self.connections: + elapsed = time.time() - self.connections.pop(conn_id) + + self.connections_duration.labels( + role=role, + protocols=protocols, + cause=cause, + ).observe(elapsed) + + elif etype == "IncomingConnection": + + self.connections_incoming.labels( + protocols=event["protocols"] + ).inc() + + elif etype == "IncomingConnectionError": + + self.connections_incoming_error.labels( + error=event["error"], + protocols=event["protocols"], + 
).inc() + + elif etype == "OutgoingConnectionError": + + self.outgoing_connection_error.labels( + peer=event["peer"], + error=event["error"], + ).inc() + + elif etype == "NewListenAddr": + + self.new_listen_addr.labels( + protocols=event["protocols"] + ).inc() + + elif etype == "ExpiredListenAddr": + + self.expired_listen_addr.labels( + protocols=event["protocols"] + ).inc() + + elif etype == "ListenerClosed": + + self.listener_closed.labels( + protocols=event["protocols"] + ).inc() + + elif etype == "ListenerError": + + self.listener_error.inc() + + elif etype == "Dialing": + + self.dial_attempt.inc() + + elif etype == "NewExternalAddrCandidate": + + self.external_addr_candidates.labels( + protocols=event["protocols"] + ).inc() + + elif etype == "ExternalAddrConfirmed": + + self.external_addr_confirmed.labels( + protocols=event["protocols"] + ).inc() + + elif etype == "ExternalAddrExpired": + + self.external_addr_expired.labels( + protocols=event["protocols"] + ).inc() \ No newline at end of file From 9a2a6a5dfad1bcf5b2a2377d62749112c07331cb Mon Sep 17 00:00:00 2001 From: Abhinav Agarwalla Date: Wed, 11 Mar 2026 14:59:42 +0530 Subject: [PATCH 09/16] chore: fix formatting --- libp2p/metrics/bandwidth.py | 6 +---- libp2p/metrics/swarm.py | 53 +++++++------------------------------ 2 files changed, 11 insertions(+), 48 deletions(-) diff --git a/libp2p/metrics/bandwidth.py b/libp2p/metrics/bandwidth.py index d24118020..dd409f067 100644 --- a/libp2p/metrics/bandwidth.py +++ b/libp2p/metrics/bandwidth.py @@ -1,4 +1,3 @@ -import asyncio from prometheus_client import Counter @@ -8,7 +7,6 @@ class BandwidthMetrics: """ def __init__(self): - self.bandwidth = Counter( "libp2p_bandwidth_bytes_total", "Bandwidth usage by direction and protocol stack", @@ -70,7 +68,6 @@ def __init__(self, transport, metrics: BandwidthMetrics): self.metrics = metrics async def dial(self, addr, protocols): - stream = await self.transport.dial(addr) return InstrumentedStream( @@ -80,11 +77,10 @@ 
async def dial(self, addr, protocols): ) async def accept(self, protocols): - stream = await self.transport.accept() return InstrumentedStream( stream, self.metrics, protocols, - ) \ No newline at end of file + ) diff --git a/libp2p/metrics/swarm.py b/libp2p/metrics/swarm.py index 0663bd35c..0e307334a 100644 --- a/libp2p/metrics/swarm.py +++ b/libp2p/metrics/swarm.py @@ -1,4 +1,5 @@ import time + from prometheus_client import Counter, Histogram @@ -9,7 +10,6 @@ class SwarmMetrics: """ def __init__(self): - # --------------------------- # incoming connections # --------------------------- @@ -40,19 +40,14 @@ def __init__(self): "swarm_connections_establishment_duration_seconds", "Time taken to establish connection", ["role", "protocols"], - buckets=( - 0.01, 0.02, 0.05, 0.1, - 0.2, 0.5, 1, 2, 5, 10 - ), + buckets=(0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1, 2, 5, 10), ) self.connections_duration = Histogram( "swarm_connections_duration_seconds", "Time a connection was alive", ["role", "protocols", "cause"], - buckets=( - 0.01, 0.1, 1, 5, 10, 30, 60, 300, 600 - ), + buckets=(0.01, 0.1, 1, 5, 10, 30, 60, 300, 600), ) # --------------------------- @@ -135,11 +130,9 @@ def record(self, event): """ Record a SwarmEvent-like object. 
""" - etype = event["type"] if etype == "ConnectionEstablished": - role = event["role"] protocols = event["protocols"] conn_id = event["connection_id"] @@ -158,7 +151,6 @@ def record(self, event): self.connections[conn_id] = time.time() elif etype == "ConnectionClosed": - conn_id = event["connection_id"] role = event["role"] protocols = event["protocols"] @@ -174,65 +166,40 @@ def record(self, event): ).observe(elapsed) elif etype == "IncomingConnection": - - self.connections_incoming.labels( - protocols=event["protocols"] - ).inc() + self.connections_incoming.labels(protocols=event["protocols"]).inc() elif etype == "IncomingConnectionError": - self.connections_incoming_error.labels( error=event["error"], protocols=event["protocols"], ).inc() elif etype == "OutgoingConnectionError": - self.outgoing_connection_error.labels( peer=event["peer"], error=event["error"], ).inc() elif etype == "NewListenAddr": - - self.new_listen_addr.labels( - protocols=event["protocols"] - ).inc() + self.new_listen_addr.labels(protocols=event["protocols"]).inc() elif etype == "ExpiredListenAddr": - - self.expired_listen_addr.labels( - protocols=event["protocols"] - ).inc() + self.expired_listen_addr.labels(protocols=event["protocols"]).inc() elif etype == "ListenerClosed": - - self.listener_closed.labels( - protocols=event["protocols"] - ).inc() + self.listener_closed.labels(protocols=event["protocols"]).inc() elif etype == "ListenerError": - self.listener_error.inc() elif etype == "Dialing": - self.dial_attempt.inc() elif etype == "NewExternalAddrCandidate": - - self.external_addr_candidates.labels( - protocols=event["protocols"] - ).inc() + self.external_addr_candidates.labels(protocols=event["protocols"]).inc() elif etype == "ExternalAddrConfirmed": - - self.external_addr_confirmed.labels( - protocols=event["protocols"] - ).inc() + self.external_addr_confirmed.labels(protocols=event["protocols"]).inc() elif etype == "ExternalAddrExpired": - - self.external_addr_expired.labels( - 
protocols=event["protocols"] - ).inc() \ No newline at end of file + self.external_addr_expired.labels(protocols=event["protocols"]).inc() From b86a271fb98d34db492df8c4852697d65b665fc7 Mon Sep 17 00:00:00 2001 From: Abhinav Agarwalla Date: Thu, 19 Mar 2026 19:50:40 +0530 Subject: [PATCH 10/16] feat: integrated gossipsub metrics with cli runtime --- examples/metrics/coordinator.py | 49 +++++++- examples/metrics/data/queries.active | Bin 20001 -> 0 bytes examples/metrics/data/wal/00000000 | Bin 32768 -> 0 bytes examples/metrics/runner.py | 52 +++++---- libp2p/__init__.py | 2 +- libp2p/metrics/dcutr.py | 26 ----- libp2p/metrics/gossipsub.py | 64 ++++++----- libp2p/metrics/kad.py | 165 --------------------------- libp2p/metrics/kad_dht.py | 165 +++++++++++++++++++++++++++ libp2p/metrics/metrics.py | 6 +- libp2p/network/swarm.py | 4 +- libp2p/pubsub/gossipsub.py | 11 -- libp2p/pubsub/pubsub.py | 19 +++ 13 files changed, 300 insertions(+), 263 deletions(-) delete mode 100644 examples/metrics/data/queries.active delete mode 100644 examples/metrics/data/wal/00000000 delete mode 100644 libp2p/metrics/dcutr.py delete mode 100644 libp2p/metrics/kad.py diff --git a/examples/metrics/coordinator.py b/examples/metrics/coordinator.py index 45437e068..e6293bc06 100644 --- a/examples/metrics/coordinator.py +++ b/examples/metrics/coordinator.py @@ -2,17 +2,25 @@ import trio from libp2p import new_host +from libp2p.custom_types import TProtocol from libp2p.host.ping import ( ID as PING_ID, PingService, handle_ping, ) +from libp2p.peer.id import ID from libp2p.peer.peerinfo import info_from_p2p_addr +from libp2p.pubsub.gossipsub import GossipSub +from libp2p.pubsub.pubsub import Pubsub +GOSSIPSUB_PROTOCOL_ID = TProtocol("/meshsub/1.0.0") COMMANDS = """ Available commands: - connect - Connect to another peer - ping - Ping to another peer +- join - Subscribe to a topic +- leave - Unsubscribe to a topic +- publish - Publish a message - local - List local multiaddr - help - List the 
existing commands - exit - Shut down @@ -28,6 +36,21 @@ def __init__(self, listen_addrs: list[multiaddr.Multiaddr]): self.host.set_stream_handler(PING_ID, handle_ping) self.ping_service = PingService(self.host) + # Set up Pubsub/Gossipsub + self.gossipsub = GossipSub( + protocols=[GOSSIPSUB_PROTOCOL_ID], + degree=3, # Number of peers to maintain in mesh + degree_low=2, # Lower bound for mesh peers + degree_high=4, # Upper bound for mesh peers + direct_peers=None, # Direct peers + time_to_live=60, # TTL for message cache in seconds + gossip_window=2, # Smaller window for faster gossip + gossip_history=5, # Keep more history + heartbeat_initial_delay=2.0, # Start heartbeats sooner + heartbeat_interval=5, # More frequent heartbeats for testing + ) + self.pubsub = Pubsub(self.host, self.gossipsub) + # CLI input send/receive channels self.input_send_channel, self.input_receive_channel = trio.open_memory_channel( 100 @@ -35,6 +58,17 @@ def __init__(self, listen_addrs: list[multiaddr.Multiaddr]): self.termination_event = trio.Event() + async def receive_loop(self, subsription): + print("Starting receive loop") + while not self.termination_event.is_set(): + try: + message = await subsription.get() + print(f"From: {ID(message.from_id).to_base58()}") + print(f"Received: {message.data.decode('utf-8')}") + except Exception: + print("Error in receive loop") + await trio.sleep(1) + async def command_executor(self, nursery): print("Starting command executor loop...") @@ -50,7 +84,7 @@ async def command_executor(self, nursery): info = info_from_p2p_addr(maddr) await self.host.connect(info) - print("Connected to {info.peer_id}") + print(f"Connected to {info.peer_id}") if cmd == "ping" and len(parts) > 1: maddr = multiaddr.Multiaddr(parts[1]) @@ -59,6 +93,19 @@ async def command_executor(self, nursery): await self.host.connect(info) await self.ping_service.ping(info.peer_id, int(parts[2])) + if cmd == "join" and len(parts) > 1: + subscription = await self.pubsub.subscribe(parts[1]) 
+ nursery.start_soon(self.receive_loop, subscription) + print(f"Subscribed to {parts[1]}") + + if cmd == "leave" and len(parts) > 1: + await self.pubsub.unsubscribe(parts[1]) + print(f"Unsubscribed to {parts[1]}") + + if cmd == "publish" and len(parts) > 2: + await self.pubsub.publish(parts[1], parts[2].encode()) + print(f"Published: {parts[2]}") + if cmd == "local": maddr = self.host.get_addrs()[0] print(maddr) diff --git a/examples/metrics/data/queries.active b/examples/metrics/data/queries.active deleted file mode 100644 index 8bfef0eabd49630443efda04a88fc8fb2b6b67bd..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 20001 zcmeIuu>b%700OW)^Y=)jV?$Vj0RsjM7%*VKfB^#r3>YwAz<>b*1`HT5V8DO@0|pEj zFkrxd0RsjM7%*VKfB^#r3>YwAz<>b*1`HT5V8DO@0|pEjFkrxd0RsjM7%*VKfB^#r H3|tHh6@UO- diff --git a/examples/metrics/data/wal/00000000 b/examples/metrics/data/wal/00000000 deleted file mode 100644 index f80ba9d8bcced89ed0e8c7e2ccd19a5cccc600ea..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 32768 zcmeI)2~<;88UWz?Ab}(VNib0(P;mi40a+AL2Z>VLpdz5MBq78QHA)tfprF>`+Ht9! zR%=xhM5wk?rJ_ZVR$HSFORY7U6k4UqDNUzTq)pcuO+yETgaiz+ouQk;(KDmC zz6!mbJ)Hq;E~)_|fVEc*fGuo-z*6`JL!<^CqlOWTbggeFS1~0`RUbxphXH2229$*p zz>a(!O#pk52Dl6v4gU>%S7UuB;T;NC$CRKf2m;7gZ-T*LrGE|_e7y&%wNrJS)Ea%7 zN~6-6R2x|K;S*Hza>{K=m>R_loq-5qYSdbt!6qP(qgR-e)6-P5RcY{V_On$6BYXq< zAisb>_*-*|;b1FVrFYdEbV`-cD4(r1n9>z#a*ax(Gi1uAWSUgQoMpf<9&knRz>Od8 zYET)~sqkIv698}&C4lby1ThSd!6Y}SVR(%yrB0h#wiAF(qDer+pTtt?)A{2Wa#LPV zDM0`y(L})GPZa3%Dy@9#;amWCqDbJxk7Q{S89$5zfS>rQkAq&VohF}>uAHeds}+ZlValVn(ox>6#oD8^>@U5`_aUGrXJ9pi;o+p>#9=-9@8W zHtG7(Pyk#+pV9*<}FSL@~gF zJyICM((8OwEn>w^V6_c#QN{e6tt3$uw>PQ{mHABWAn=G+;GRzy206lqSmA*m$vhb z2e8(H>-xTk)dP<8Z8b2@%h+S`-wW_?Ku^O>Pl|k$F)aIH)T6wVMEG}VkfSq}? 
zM@vrD^O6IpTv&3%Y*=}m$;#vKyy`f-(E3}9PV6}h;3^!&C94gyFBiB2vEw{$FCZ4M z1)z_R13clE7yRl6zkFZ~^8@0X`OHvYS!iTiPkiF&R*4m|Y7X5*gel>a5*h+LvWm{% z#|qCayMGxG8c$G4m=bv~^!?0dSmE)8fqf9+yK$5f%7c#S{x!S;C%p9LP)|N0+;|ZR zcMflLKRkr(hXQi;H=u&xHg`mVodPU+;te=Uc%vj1jypqMjyo*(Ki?m{$k8>aY2e(n z;laX1ugngrZ(J*~LzIP6t)o;E?1_ldJh&FT9CCQ%kmJebkYkP?oF`+(oMP=OV~(_N zu3zKPEL=9L-yZw)7sUC^EebkiX*&cf!ZNVV;N>F&5a+*)6m+I|x^-O<|4*#*@#C;F zi1S7h1)Y+KC8PGvOTs$&7Y}zqoKF{1(3$K7Hoaq-gLM-2GZtDpH}V`Q=#<8&zD$48 z8S9)d|BHCU*?V2PIUgQ=U+VR!Eyi2Pl}Ez<-_5`}7xbObgg6JgQP3$(t>`rF7=FC{ zZIk9C;vAPxL8qi*b=J)|^De^6&7qgO2-WV6Gmvp;W3nFwol^Hc-vo(GT&=)){NF6+@wFS^X7xtP#R}K` zFU;i4tCf5oqwOF=ORJUIIfny~S^UO>l~z+CtCgFucESRR^v(o!EDx?$B!`$nULPQq zX2G2(PxuOVOpu+Ds|KbwvQr|N9l2JLd)S=S#1~ugFGt57DTyafKRsuI^@arT znTS_Mo}IQ&e%szF3Hyw3pkX-H%UZB+1>#-oNnx*aN$u3yhUW9)7UowsGOme{U5e~= zg;w(-d;B+_=htG7pPV-Tz}F9DHaRQLR1%l}S~1>{lAtV2u( za&=^!0MD%9>d3eSzQWa!aTh$h{=@P}01uL%L}d}?^%0M}vIO|%J4>?0@G;&)R*(Z0 z;mHE8oL|)+pB~Re&^6^0M<-=}n>VfDAQoMBzu%Hx2)dCxd~Kgho3BL7f@PQm?+LII zUnDy4|4Vd3m#qosuj0+a^Xqt)=1mS&)HX}AK8mV+d$;*rDs*{nE!B$#WoiP)lKRiceL~W7(f#^L%&5ybI$H^C=eyE_wa(WuF%Q=|0`fJa~uQC7k^MfCWn?Sh?%xiDfMntUG~&-`Rt(6Uj5&ucx=B zJevzv%IgbO*ozw$nXl|wHUz=Tk}ua1=20xSpt)S-r8bwVt(@j^wN(ILz2MclF0gDp zzxxDB)(EfNKNCHWEj?+VV#?(2)3M3EbEWA{Iil8(C%TkYOX_PbS3Ii0tCvKL(IV=F zov5ysRZmlXV7h=;$K)3GL)2^6P+gsDe_jzkX)IoST`?O~e@dSHQu+y(ZyCS#y>h)tUPDFD36?qa&X?4=d$;C%Uug9u??^pH1azuT)ztz;pqNBllAS1QHC_XFjLbAu+ zM7&!ZA<979SN2ffEpghi?0x}$s`5L(N|KAX9}T0vTYB5ArYPtae7@c7wYx3do7vGs<1xIMk`?())td?erTSKG}E?|Q=v-&Uw&Ne5Qv+>6P@yR&Ad zY(m^KXHwrSS>CsY#-PNzqZ8I3+YFl*-0uKx&sl{>8bk5!M_;SskbHkQkNR%efjgB? 
zcYT3(?>MpsxkS&QetR$8{iujP0Lk~|Z0fsZbAnS#WUra0mKQbsgt#B{roKD*KQ8GHvcvHC zKFKXFwshy%f8K6x^1AZqLMzN$yCpMSZalf(h<9hK&6$U|=T}kRE!9MN&&(>oyQjK6 zo`bmmp`^ZBwmIjMz2)W$?=AQ50fB-wh&%rl_1(!muG~79VZQj@vU|gw%8wBDK4S-P z8-AYP=KPxHyeOw23UU9?)B)U@ag|FWUbC;*x9$mYl_lrm;&yYBKf@*etQF>|l5HN> zRrmDxaV@@Pkpao~JqPN$Wh+Lnk1kH~A@BskbZOzxYXHZFV_N-lwM`bhZHxf0GlA z9Eatuj_d&N4LIN=JZ>038830(1oE2+$FrBS1%hjsP72Is$YA=m^jepd&y>fQ|qi0XhP7 q1n3CR5uhVLM}Uq19RWH5bOh)K&=H^`Ku3U%038830(1m05%@3p! None: node.host.run(listen_addrs=listen_addrs), trio.open_nursery() as nursery, ): - # Start metrics service - metrics = Metrics() - - nursery.start_soon( - metrics.start_prometheus_server, node.host.metric_recv_channel - ) nursery.start_soon(node.host.get_peerstore().start_cleanup_task, 60) - nursery.start_soon(node.command_executor, nursery) - print(f"Host multiaddr: {node.host.get_addrs()[0]}") - await trio.sleep(1) - print("Entering intractive mode, type commands below.") - promt_session = PromptSession() - print(COMMANDS) + async with background_trio_service(node.pubsub): + async with background_trio_service(node.gossipsub): + await trio.sleep(1) + await node.pubsub.wait_until_ready() + print("Gossipsub and Pubsub services started !!") - while not node.termination_event.is_set(): - try: - _ = await trio.to_thread.run_sync(input) - user_input = await trio.to_thread.run_sync( - lambda: promt_session.prompt("Command> ") + # METRICS + metrics = Metrics() + nursery.start_soon( + metrics.start_prometheus_server, node.host.metric_recv_channel ) - cmds = user_input.strip().split(" ", 2) - await node.input_send_channel.send(cmds) - - except Exception as e: - print(f"Error in the interactive shell: {e}") + nursery.start_soon(node.command_executor, nursery) await trio.sleep(1) + print("Entering intractive mode, type commands below.") + promt_session = PromptSession() + print(COMMANDS) + + while not node.termination_event.is_set(): + try: + _ = await trio.to_thread.run_sync(input) + user_input = await 
trio.to_thread.run_sync( + lambda: promt_session.prompt("Command> ") + ) + cmds = user_input.strip().split(" ", 2) + await node.input_send_channel.send(cmds) + + except Exception as e: + print(f"Error in the interactive shell: {e}") + await trio.sleep(1) + print("Shutdown complete, Goodbye!") diff --git a/libp2p/__init__.py b/libp2p/__init__.py index f059c27ee..f7d774676 100644 --- a/libp2p/__init__.py +++ b/libp2p/__init__.py @@ -576,7 +576,7 @@ def new_host( enable_upnp=enable_upnp, negotiate_timeout=negotiate_timeout, resource_manager=resource_manager, - metric_recv_channel=metric_recv_channel + metric_recv_channel=metric_recv_channel, bootstrap_allow_ipv6=bootstrap_allow_ipv6, bootstrap_dns_timeout=bootstrap_dns_timeout, bootstrap_dns_max_retries=bootstrap_dns_max_retries, diff --git a/libp2p/metrics/dcutr.py b/libp2p/metrics/dcutr.py deleted file mode 100644 index 85d27c227..000000000 --- a/libp2p/metrics/dcutr.py +++ /dev/null @@ -1,26 +0,0 @@ -from prometheus_client import Counter - - -class DcutrEvent: - peer_id: str - success: bool - error: Exception | None = None - - -class DcutrMetrics: - events: Counter - - def __init__(self): - self.events = Counter( - "dcutr_events_total", - "Events emitted by the DCUtR behaviour", - labelnames=["event"], - ) - - def record(self, event: DcutrEvent) -> None: - if event.success: - label = "direct_connection_upgrade_succeeded" - else: - label = "direct_connection_upgrade_failed" - - self.events.labels(event=label).inc() diff --git a/libp2p/metrics/gossipsub.py b/libp2p/metrics/gossipsub.py index 9daf56557..a20874e60 100644 --- a/libp2p/metrics/gossipsub.py +++ b/libp2p/metrics/gossipsub.py @@ -1,31 +1,39 @@ from prometheus_client import Counter, Histogram -from libp2p.pubsub.gossipsub import GossipsubEvent +from libp2p.pubsub.pubsub import GossipsubEvent class GossipsubMetrics: - delivered: Counter - dropped: Counter - validated_fail: Counter + publish: Counter + subopts: Counter + control: Counter + + received: Counter 
msg_size: Histogram def __init__(self): - self.delivered = Counter( - "gossipsub_delivered_total", - "Messages successfully delivered", - labelnames=["topic"], + self.received = Counter( + "gossipsub_receiived_total", + "Messages successfully received", + labelnames=["peer_id"], + ) + + self.publish = Counter( + "gossipsub_publish_total", + "Messages to be published", + labelnames=["peer_id"], ) - self.dropped = Counter( - "gossipsub_dropped_total", - "Messages dropped", - labelnames=["topic", "reason"], + self.subopts = Counter( + "gossipsub_subopts_total", + "Messages notifying peer subscriptions", + labelnames=["peer_id"], ) - self.validated_fail = Counter( - "gossipsub_validation_failed_total", - "Messages rejected by validator", - labelnames=["topic", "error"], + self.control = Counter( + "gossipsub_control_total", + "Received control messages", + labelnames=["peer_id"], ) self.msg_size = Histogram( @@ -35,20 +43,16 @@ def __init__(self): ) def record(self, event: GossipsubEvent) -> None: - if event.delivered: - self.delivered.labels(topic=event.topic).inc() + self.received.labels(peer_id=event.peer_id).inc() + + if event.publish: + self.publish.labels(peer_id=event.peer_id).inc() + + if event.subopts: + self.subopts.labels(peer_id=event.peer_id).inc() + + if event.control: + self.control.labels(peer_id=event.peer_id).inc() if event.message_size is not None: self.msg_size.observe(event.message_size) - - if event.dropped_reason: - self.dropped.labels( - topic=event.topic, - reason=event.dropped_reason, - ).inc() - - if event.validation_error: - self.validated_fail.labels( - topic=event.topic, - error=type(event.validation_error).__name__, - ).inc() diff --git a/libp2p/metrics/kad.py b/libp2p/metrics/kad.py deleted file mode 100644 index b58ab8504..000000000 --- a/libp2p/metrics/kad.py +++ /dev/null @@ -1,165 +0,0 @@ -from prometheus_client import Counter, Histogram - - -class KadMetrics: - """ - Prometheus metrics for the Kademlia behaviour. 
- Mirrors the Rust libp2p metrics design. - """ - - def __init__(self): - # ------------------------- - # GetRecord metrics - # ------------------------- - - self.query_result_get_record_ok = Counter( - "kad_query_result_get_record_ok_total", - "Number of records returned by a successful Kademlia get record query", - ) - - self.query_result_get_record_error = Counter( - "kad_query_result_get_record_error_total", - "Number of failed Kademlia get record queries", - labelnames=["error"], - ) - - # ------------------------- - # GetClosestPeers metrics - # ------------------------- - - self.query_result_get_closest_peers_ok = Histogram( - "kad_query_result_get_closest_peers_ok", - "Number of closest peers returned by a successful query", - buckets=(1, 2, 4, 8, 16, 32, 64, 128, 256, 512), - ) - - self.query_result_get_closest_peers_error = Counter( - "kad_query_result_get_closest_peers_error_total", - "Number of failed get closest peers queries", - labelnames=["error"], - ) - - # ------------------------- - # GetProviders metrics - # ------------------------- - - self.query_result_get_providers_ok = Histogram( - "kad_query_result_get_providers_ok", - "Number of providers returned by a successful query", - buckets=(1, 2, 4, 8, 16, 32, 64, 128, 256, 512), - ) - - self.query_result_get_providers_error = Counter( - "kad_query_result_get_providers_error_total", - "Number of failed get providers queries", - labelnames=["error"], - ) - - # ------------------------- - # Query statistics - # ------------------------- - - self.query_result_num_requests = Histogram( - "kad_query_result_num_requests", - "Number of requests started for a Kademlia query", - labelnames=["type"], - buckets=(1, 2, 4, 8, 16, 32, 64, 128, 256, 512), - ) - - self.query_result_num_success = Histogram( - "kad_query_result_num_success", - "Number of successful requests of a Kademlia query", - labelnames=["type"], - buckets=(1, 2, 4, 8, 16, 32, 64, 128, 256, 512), - ) - - self.query_result_num_failure = 
Histogram( - "kad_query_result_num_failure", - "Number of failed requests of a Kademlia query", - labelnames=["type"], - buckets=(1, 2, 4, 8, 16, 32, 64, 128, 256, 512), - ) - - self.query_result_duration = Histogram( - "kad_query_result_duration_seconds", - "Duration of a Kademlia query", - labelnames=["type"], - buckets=(0.1, 0.2, 0.4, 0.8, 1.6, 3.2, 6.4, 12.8, 25.6), - ) - - # ------------------------- - # Routing table updates - # ------------------------- - - self.routing_updated = Counter( - "kad_routing_updated_total", - "Peers added, updated, or evicted from routing table", - labelnames=["action", "bucket"], - ) - - # ------------------------- - # inbound requests - # ------------------------- - - self.inbound_requests = Counter( - "kad_inbound_requests_total", - "Number of inbound requests", - labelnames=["request"], - ) - - # ----------------------------------------------------- - - def record_outbound_query(self, query_type, stats): - self.query_result_num_requests.labels(type=query_type).observe( - stats["num_requests"] - ) - - self.query_result_num_success.labels(type=query_type).observe( - stats["num_success"] - ) - - self.query_result_num_failure.labels(type=query_type).observe( - stats["num_failures"] - ) - - if stats.get("duration") is not None: - self.query_result_duration.labels(type=query_type).observe( - stats["duration"] - ) - - # ----------------------------------------------------- - - def record_get_record_ok(self): - self.query_result_get_record_ok.inc() - - def record_get_record_error(self, error): - self.query_result_get_record_error.labels(error=error).inc() - - # ----------------------------------------------------- - - def record_get_closest_peers_ok(self, peer_count): - self.query_result_get_closest_peers_ok.observe(peer_count) - - def record_get_closest_peers_error(self, error): - self.query_result_get_closest_peers_error.labels(error=error).inc() - - # ----------------------------------------------------- - - def 
record_get_providers_ok(self, provider_count): - self.query_result_get_providers_ok.observe(provider_count) - - def record_get_providers_error(self, error): - self.query_result_get_providers_error.labels(error=error).inc() - - # ----------------------------------------------------- - - def record_routing_update(self, action, bucket): - self.routing_updated.labels( - action=action, - bucket=str(bucket), - ).inc() - - # ----------------------------------------------------- - - def record_inbound_request(self, request_type): - self.inbound_requests.labels(request=request_type).inc() diff --git a/libp2p/metrics/kad_dht.py b/libp2p/metrics/kad_dht.py index e69de29bb..b58ab8504 100644 --- a/libp2p/metrics/kad_dht.py +++ b/libp2p/metrics/kad_dht.py @@ -0,0 +1,165 @@ +from prometheus_client import Counter, Histogram + + +class KadMetrics: + """ + Prometheus metrics for the Kademlia behaviour. + Mirrors the Rust libp2p metrics design. + """ + + def __init__(self): + # ------------------------- + # GetRecord metrics + # ------------------------- + + self.query_result_get_record_ok = Counter( + "kad_query_result_get_record_ok_total", + "Number of records returned by a successful Kademlia get record query", + ) + + self.query_result_get_record_error = Counter( + "kad_query_result_get_record_error_total", + "Number of failed Kademlia get record queries", + labelnames=["error"], + ) + + # ------------------------- + # GetClosestPeers metrics + # ------------------------- + + self.query_result_get_closest_peers_ok = Histogram( + "kad_query_result_get_closest_peers_ok", + "Number of closest peers returned by a successful query", + buckets=(1, 2, 4, 8, 16, 32, 64, 128, 256, 512), + ) + + self.query_result_get_closest_peers_error = Counter( + "kad_query_result_get_closest_peers_error_total", + "Number of failed get closest peers queries", + labelnames=["error"], + ) + + # ------------------------- + # GetProviders metrics + # ------------------------- + + 
self.query_result_get_providers_ok = Histogram( + "kad_query_result_get_providers_ok", + "Number of providers returned by a successful query", + buckets=(1, 2, 4, 8, 16, 32, 64, 128, 256, 512), + ) + + self.query_result_get_providers_error = Counter( + "kad_query_result_get_providers_error_total", + "Number of failed get providers queries", + labelnames=["error"], + ) + + # ------------------------- + # Query statistics + # ------------------------- + + self.query_result_num_requests = Histogram( + "kad_query_result_num_requests", + "Number of requests started for a Kademlia query", + labelnames=["type"], + buckets=(1, 2, 4, 8, 16, 32, 64, 128, 256, 512), + ) + + self.query_result_num_success = Histogram( + "kad_query_result_num_success", + "Number of successful requests of a Kademlia query", + labelnames=["type"], + buckets=(1, 2, 4, 8, 16, 32, 64, 128, 256, 512), + ) + + self.query_result_num_failure = Histogram( + "kad_query_result_num_failure", + "Number of failed requests of a Kademlia query", + labelnames=["type"], + buckets=(1, 2, 4, 8, 16, 32, 64, 128, 256, 512), + ) + + self.query_result_duration = Histogram( + "kad_query_result_duration_seconds", + "Duration of a Kademlia query", + labelnames=["type"], + buckets=(0.1, 0.2, 0.4, 0.8, 1.6, 3.2, 6.4, 12.8, 25.6), + ) + + # ------------------------- + # Routing table updates + # ------------------------- + + self.routing_updated = Counter( + "kad_routing_updated_total", + "Peers added, updated, or evicted from routing table", + labelnames=["action", "bucket"], + ) + + # ------------------------- + # inbound requests + # ------------------------- + + self.inbound_requests = Counter( + "kad_inbound_requests_total", + "Number of inbound requests", + labelnames=["request"], + ) + + # ----------------------------------------------------- + + def record_outbound_query(self, query_type, stats): + self.query_result_num_requests.labels(type=query_type).observe( + stats["num_requests"] + ) + + 
self.query_result_num_success.labels(type=query_type).observe( + stats["num_success"] + ) + + self.query_result_num_failure.labels(type=query_type).observe( + stats["num_failures"] + ) + + if stats.get("duration") is not None: + self.query_result_duration.labels(type=query_type).observe( + stats["duration"] + ) + + # ----------------------------------------------------- + + def record_get_record_ok(self): + self.query_result_get_record_ok.inc() + + def record_get_record_error(self, error): + self.query_result_get_record_error.labels(error=error).inc() + + # ----------------------------------------------------- + + def record_get_closest_peers_ok(self, peer_count): + self.query_result_get_closest_peers_ok.observe(peer_count) + + def record_get_closest_peers_error(self, error): + self.query_result_get_closest_peers_error.labels(error=error).inc() + + # ----------------------------------------------------- + + def record_get_providers_ok(self, provider_count): + self.query_result_get_providers_ok.observe(provider_count) + + def record_get_providers_error(self, error): + self.query_result_get_providers_error.labels(error=error).inc() + + # ----------------------------------------------------- + + def record_routing_update(self, action, bucket): + self.routing_updated.labels( + action=action, + bucket=str(bucket), + ).inc() + + # ----------------------------------------------------- + + def record_inbound_request(self, request_type): + self.inbound_requests.labels(request=request_type).inc() diff --git a/libp2p/metrics/metrics.py b/libp2p/metrics/metrics.py index 7e093c229..f0ea71432 100644 --- a/libp2p/metrics/metrics.py +++ b/libp2p/metrics/metrics.py @@ -4,10 +4,9 @@ import trio from libp2p.host.ping import PingEvent -from libp2p.metrics.dcutr import DcutrEvent, DcutrMetrics from libp2p.metrics.gossipsub import GossipsubMetrics from libp2p.metrics.ping import PingMetrics -from libp2p.pubsub.gossipsub import GossipsubEvent +from libp2p.pubsub.pubsub import 
GossipsubEvent def find_available_port(start_port: int = 8000, host: str = "127.0.0.1") -> int: @@ -28,7 +27,6 @@ class Metrics: def __init__(self): self.ping = PingMetrics() self.gossipsub = GossipsubMetrics() - self.dcutr = DcutrMetrics() async def start_prometheus_server( self, @@ -59,5 +57,3 @@ async def start_prometheus_server( self.ping.record(event) case GossipsubEvent(): self.gossipsub.record(event) - case DcutrEvent(): - self.dcutr.record(event) diff --git a/libp2p/network/swarm.py b/libp2p/network/swarm.py index 185626ee4..14d4c672c 100644 --- a/libp2p/network/swarm.py +++ b/libp2p/network/swarm.py @@ -830,8 +830,7 @@ async def upgrade_outbound_raw_conn( pass swarm_conn = await self.add_conn(muxed_conn, direction="outbound") - - swarm_conn._metric_send_channel = self.metric_send_channel + # swarm_conn._metric_send_channel = self.metric_send_channel logger.debug("successfully dialed peer %s", peer_id) return swarm_conn @@ -1526,6 +1525,7 @@ async def add_conn( self, direction=direction, ) + swarm_conn._metric_send_channel = self.metric_send_channel # Set actual transport addresses and connection type from the muxed connection. 
# This captures the real transport info (IP/port, direct vs relayed) diff --git a/libp2p/pubsub/gossipsub.py b/libp2p/pubsub/gossipsub.py index 46af9fad0..027275bc3 100644 --- a/libp2p/pubsub/gossipsub.py +++ b/libp2p/pubsub/gossipsub.py @@ -78,17 +78,6 @@ _MAX_PENDING_GRAFT_PRUNE_PER_PEER = 64 -class GossipsubEvent: - peer_id: str - topic: str - - # one of these should be set - message_size: int | None = None - delivered: bool = False - dropped_reason: str | None = None - validation_error: Exception | None = None - - class GossipSub(IPubsubRouter, Service): protocols: list[TProtocol] pubsub: Pubsub | None diff --git a/libp2p/pubsub/pubsub.py b/libp2p/pubsub/pubsub.py index 57197d486..2ab0c7ed7 100644 --- a/libp2p/pubsub/pubsub.py +++ b/libp2p/pubsub/pubsub.py @@ -281,6 +281,16 @@ def clear_expired(self) -> None: MAX_CONCURRENT_VALIDATORS = 10 +class GossipsubEvent: + peer_id: str + + publish: bool = False + subopts: bool = False + control: bool = False + + message_size: int | None = None + + class Pubsub(Service, IPubsub): host: IHost @@ -479,8 +489,13 @@ async def continuously_read_stream(self, stream: INetStream) -> None: ) continue + event = GossipsubEvent() + event.peer_id = peer_id + event.message_size = len(incoming) + if rpc_incoming.publish: # deal with RPC.publish + event.publish = True for msg in rpc_incoming.publish: if not self._is_subscribed_to_msg(msg): continue @@ -497,6 +512,7 @@ async def continuously_read_stream(self, stream: INetStream) -> None: # peers because a given node only needs its peers # to know that it is subscribed to the topic (doesn't # need everyone to know) + event.subopts = True for message in rpc_incoming.subscriptions: logger.debug( "received `subscriptions` message %s from peer %s", @@ -509,6 +525,7 @@ async def continuously_read_stream(self, stream: INetStream) -> None: # This is necessary because `control` is an optional field in pb2. 
# Ref: https://developers.google.com/protocol-buffers/docs/reference/python-generated#singular-fields-proto2 # noqa: E501 if rpc_incoming.HasField("control"): + event.control = True # Pass rpc to router so router could perform custom logic logger.debug( "received `control` message %s from peer %s", @@ -516,6 +533,8 @@ async def continuously_read_stream(self, stream: INetStream) -> None: peer_id, ) await self.router.handle_rpc(rpc_incoming, peer_id) + + await stream.metric_send_channel.send(event) except StreamEOF: logger.debug( f"Stream closed for peer {peer_id}, exiting read loop cleanly." From 50297a47ca7e89a9b44c68ca759eca166148b081 Mon Sep 17 00:00:00 2001 From: Abhinav Agarwalla Date: Fri, 20 Mar 2026 16:09:18 +0530 Subject: [PATCH 11/16] feat: fixed bugs in kad-dht metrics code --- libp2p/kad_dht/kad_dht.py | 31 ++++++ libp2p/metrics/kad_dht.py | 197 +++++++++++--------------------------- libp2p/metrics/metrics.py | 5 + 3 files changed, 90 insertions(+), 143 deletions(-) diff --git a/libp2p/kad_dht/kad_dht.py b/libp2p/kad_dht/kad_dht.py index 01aa23afc..9b9929228 100644 --- a/libp2p/kad_dht/kad_dht.py +++ b/libp2p/kad_dht/kad_dht.py @@ -104,6 +104,16 @@ def is_valid_timestamp(ts: float) -> bool: return True +class KadDhtEvent: + peer_id: str + + inbound: bool = False + find_node: bool = False + get_value: bool = False + put_value: bool = False + get_providers: bool = False + add_provider: bool = False + class KadDHT(Service): """ Kademlia DHT implementation for libp2p. 
@@ -473,6 +483,10 @@ async def handle_stream(self, stream: INetStream) -> None: f"Received DHT message from {peer_id}, type: {message.type}" ) + event = KadDhtEvent() + event.peer_id = peer_id + event.inbound = True + # Handle FIND_NODE message if message.type == Message.MessageType.FIND_NODE: # Get target key directly from protobuf @@ -492,6 +506,9 @@ async def handle_stream(self, stream: INetStream) -> None: await stream.close() return + # Metrics Event + event.find_node = True + # Build response message with protobuf response = Message() response.type = Message.MessageType.FIND_NODE @@ -553,6 +570,9 @@ async def handle_stream(self, stream: INetStream) -> None: await stream.close() return + # Metrics Event + event.add_provider = True + # Extract provider information for provider_proto in message.providerPeers: try: @@ -621,6 +641,9 @@ async def handle_stream(self, stream: INetStream) -> None: await stream.close() return + # Metrics event + event.get_providers = True + # Find providers for the key providers = self.provider_store.get_providers(key) logger.debug( @@ -715,6 +738,9 @@ async def handle_stream(self, stream: INetStream) -> None: await stream.close() return + # Metrics Event + event.get_value = True + value_record = self.value_store.get(key) if value_record: logger.debug(f"Found value for key {key.hex()}") @@ -807,6 +833,8 @@ async def handle_stream(self, stream: INetStream) -> None: await stream.close() return + event.put_value = True + try: if not (key and value): raise ValueError( @@ -848,6 +876,9 @@ async def handle_stream(self, stream: INetStream) -> None: except Exception as proto_err: logger.warning(f"Failed to parse protobuf message: {proto_err}") + # Send KAD-DHT event to Metrics + stream.metric_send_channel.send(event) + await stream.close() except Exception as e: logger.error(f"Error handling DHT stream: {e}") diff --git a/libp2p/metrics/kad_dht.py b/libp2p/metrics/kad_dht.py index b58ab8504..d480c0fe9 100644 --- a/libp2p/metrics/kad_dht.py +++ 
b/libp2p/metrics/kad_dht.py @@ -1,165 +1,76 @@ -from prometheus_client import Counter, Histogram +from prometheus_client import Counter +from libp2p.kad_dht.kad_dht import KadDhtEvent -class KadMetrics: - """ - Prometheus metrics for the Kademlia behaviour. - Mirrors the Rust libp2p metrics design. - """ +# COUNTER - def __init__(self): - # ------------------------- - # GetRecord metrics - # ------------------------- - - self.query_result_get_record_ok = Counter( - "kad_query_result_get_record_ok_total", - "Number of records returned by a successful Kademlia get record query", - ) - - self.query_result_get_record_error = Counter( - "kad_query_result_get_record_error_total", - "Number of failed Kademlia get record queries", - labelnames=["error"], - ) +# INBOUND_REQ +# FIND_NODE +# GET_VALUE +# PUT_VALUE +# GET_PROVIDERS +# ADD_PROVIDERS - # ------------------------- - # GetClosestPeers metrics - # ------------------------- - - self.query_result_get_closest_peers_ok = Histogram( - "kad_query_result_get_closest_peers_ok", - "Number of closest peers returned by a successful query", - buckets=(1, 2, 4, 8, 16, 32, 64, 128, 256, 512), - ) - - self.query_result_get_closest_peers_error = Counter( - "kad_query_result_get_closest_peers_error_total", - "Number of failed get closest peers queries", - labelnames=["error"], - ) - - # ------------------------- - # GetProviders metrics - # ------------------------- - - self.query_result_get_providers_ok = Histogram( - "kad_query_result_get_providers_ok", - "Number of providers returned by a successful query", - buckets=(1, 2, 4, 8, 16, 32, 64, 128, 256, 512), - ) - - self.query_result_get_providers_error = Counter( - "kad_query_result_get_providers_error_total", - "Number of failed get providers queries", - labelnames=["error"], - ) - - # ------------------------- - # Query statistics - # ------------------------- - - self.query_result_num_requests = Histogram( - "kad_query_result_num_requests", - "Number of requests started for a 
Kademlia query", - labelnames=["type"], - buckets=(1, 2, 4, 8, 16, 32, 64, 128, 256, 512), - ) +class KadDhtMetrics: + inbound: Counter + find_node: Counter + get_value: Counter + put_value: Counter + get_providers: Counter + add_provider: Counter - self.query_result_num_success = Histogram( - "kad_query_result_num_success", - "Number of successful requests of a Kademlia query", - labelnames=["type"], - buckets=(1, 2, 4, 8, 16, 32, 64, 128, 256, 512), - ) - - self.query_result_num_failure = Histogram( - "kad_query_result_num_failure", - "Number of failed requests of a Kademlia query", - labelnames=["type"], - buckets=(1, 2, 4, 8, 16, 32, 64, 128, 256, 512), - ) - - self.query_result_duration = Histogram( - "kad_query_result_duration_seconds", - "Duration of a Kademlia query", - labelnames=["type"], - buckets=(0.1, 0.2, 0.4, 0.8, 1.6, 3.2, 6.4, 12.8, 25.6), + def __init__(self): + self.inbound = Counter( + "kad_inbound_total", + "Total inbound requests received", + labelnames=["peer_id"], ) - # ------------------------- - # Routing table updates - # ------------------------- - - self.routing_updated = Counter( - "kad_routing_updated_total", - "Peers added, updated, or evicted from routing table", - labelnames=["action", "bucket"], + self.find_node = Counter( + "kad_inbound_find_node", + "Total inbound FIND_NODE requests received", + labelnames=["peer_id"], ) - # ------------------------- - # inbound requests - # ------------------------- - - self.inbound_requests = Counter( - "kad_inbound_requests_total", - "Number of inbound requests", - labelnames=["request"], + self.get_value = Counter( + "kad_inbound_get_value", + "Total inbound GET_VALUE requests received", + labelnames=["peer_id"], ) - # ----------------------------------------------------- - - def record_outbound_query(self, query_type, stats): - self.query_result_num_requests.labels(type=query_type).observe( - stats["num_requests"] + self.put_value = Counter( + "kad_inbound_put_value", + "Total inbound 
PUT_VALUE requests received", + labelnames=["peer_id"], ) - self.query_result_num_success.labels(type=query_type).observe( - stats["num_success"] + self.get_providers = Counter( + "kad_inbound_get_providers", + "Total inbound GET_PROVIDERS requests received", + labelnames=["peer_id"], ) - self.query_result_num_failure.labels(type=query_type).observe( - stats["num_failures"] + self.add_provider = Counter( + "kad_inbound_add_provider", + "Total inbound ADD_PROVIDER requests received", + labelnames=["peer_id"], ) - if stats.get("duration") is not None: - self.query_result_duration.labels(type=query_type).observe( - stats["duration"] - ) - - # ----------------------------------------------------- - - def record_get_record_ok(self): - self.query_result_get_record_ok.inc() - - def record_get_record_error(self, error): - self.query_result_get_record_error.labels(error=error).inc() - - # ----------------------------------------------------- - - def record_get_closest_peers_ok(self, peer_count): - self.query_result_get_closest_peers_ok.observe(peer_count) - - def record_get_closest_peers_error(self, error): - self.query_result_get_closest_peers_error.labels(error=error).inc() - - # ----------------------------------------------------- - - def record_get_providers_ok(self, provider_count): - self.query_result_get_providers_ok.observe(provider_count) + def record(self, event: KadDhtEvent) -> None: + if event.inbound: + self.inbound.labels(peer_id=event.peer_id).inc() - def record_get_providers_error(self, error): - self.query_result_get_providers_error.labels(error=error).inc() + if event.find_node: + self.find_node.labels(peer_id=event.peer_id).inc() - # ----------------------------------------------------- + if event.get_value: + self.get_value.labels(peer_id=event.peer_id).inc() - def record_routing_update(self, action, bucket): - self.routing_updated.labels( - action=action, - bucket=str(bucket), - ).inc() + if event.put_value: + 
self.put_value.labels(peer_id=event.peer_id).inc() - # ----------------------------------------------------- + if event.get_providers: + self.get_providers.labels(peer_id=event.peer_id).inc() - def record_inbound_request(self, request_type): - self.inbound_requests.labels(request=request_type).inc() + if event.add_provider: + self.add_provider.labels(peer_id=event.peer_id).inc() diff --git a/libp2p/metrics/metrics.py b/libp2p/metrics/metrics.py index f0ea71432..1beec8189 100644 --- a/libp2p/metrics/metrics.py +++ b/libp2p/metrics/metrics.py @@ -4,7 +4,9 @@ import trio from libp2p.host.ping import PingEvent +from libp2p.kad_dht.kad_dht import KadDhtEvent from libp2p.metrics.gossipsub import GossipsubMetrics +from libp2p.metrics.kad_dht import KadDhtMetrics from libp2p.metrics.ping import PingMetrics from libp2p.pubsub.pubsub import GossipsubEvent @@ -27,6 +29,7 @@ class Metrics: def __init__(self): self.ping = PingMetrics() self.gossipsub = GossipsubMetrics() + self.kad_dht = KadDhtMetrics() async def start_prometheus_server( self, @@ -57,3 +60,5 @@ async def start_prometheus_server( self.ping.record(event) case GossipsubEvent(): self.gossipsub.record(event) + case KadDhtEvent(): + self.kad_dht.record(event) From 31ce3f4549f59fc5d091762a08359b188b09dbc7 Mon Sep 17 00:00:00 2001 From: Abhinav Agarwalla Date: Sun, 22 Mar 2026 09:26:55 +0530 Subject: [PATCH 12/16] fix: kad-dht metrics working now --- examples/kademlia/server_node_addr.txt | 1 + examples/metrics/coordinator.py | 71 ++++++++++++++++++++++++-- examples/metrics/runner.py | 60 ++++++++++++---------- libp2p/kad_dht/kad_dht.py | 4 +- libp2p/metrics/kad_dht.py | 8 +++ libp2p/pubsub/pubsub.py | 4 +- 6 files changed, 114 insertions(+), 34 deletions(-) create mode 100644 examples/kademlia/server_node_addr.txt diff --git a/examples/kademlia/server_node_addr.txt b/examples/kademlia/server_node_addr.txt new file mode 100644 index 000000000..23f7025a2 --- /dev/null +++ b/examples/kademlia/server_node_addr.txt @@ -0,0 
+1 @@ +/ip4/172.16.68.73/tcp/22943/p2p/16Uiu2HAmLipMqhDY6J6pbQzNJRVE9Ztt1qZfK6raPb5UDRwVM3Gz diff --git a/examples/metrics/coordinator.py b/examples/metrics/coordinator.py index e6293bc06..fa43ef36f 100644 --- a/examples/metrics/coordinator.py +++ b/examples/metrics/coordinator.py @@ -8,10 +8,13 @@ PingService, handle_ping, ) +from libp2p.kad_dht.kad_dht import DHTMode, KadDHT from libp2p.peer.id import ID from libp2p.peer.peerinfo import info_from_p2p_addr from libp2p.pubsub.gossipsub import GossipSub from libp2p.pubsub.pubsub import Pubsub +from libp2p.records.validator import Validator +from libp2p.utils.paths import get_script_dir, join_paths GOSSIPSUB_PROTOCOL_ID = TProtocol("/meshsub/1.0.0") COMMANDS = """ @@ -21,22 +24,37 @@ - join - Subscribe to a topic - leave - Unsubscribe to a topic - publish - Publish a message +- put - Execute PUT_VALUE in DHT +- get - Execute GET_VALUE in DHT +- advertize - Execute ADD_PROVIDER in DHT +- get_provider - Execute GET_PROVIDERS in DHT - local - List local multiaddr - help - List the existing commands - exit - Shut down """ +class ExampleValidator(Validator): + def validate(self, key: str, value: bytes) -> None: + if not value: + raise ValueError("Value cannot be empty") + + def select(self, key: str, values: list[bytes]) -> int: + return 0 + + class Node: - def __init__(self, listen_addrs: list[multiaddr.Multiaddr]): + def __init__( + self, listen_addrs: list[multiaddr.Multiaddr], dht_role: str + ): # Create a libp2p-host self.host = new_host(listen_addrs=listen_addrs, enable_metrics=True) - # Setup PING service + # PING self.host.set_stream_handler(PING_ID, handle_ping) self.ping_service = PingService(self.host) - # Set up Pubsub/Gossipsub + # Pubsub/Gossipsub self.gossipsub = GossipSub( protocols=[GOSSIPSUB_PROTOCOL_ID], degree=3, # Number of peers to maintain in mesh @@ -51,6 +69,14 @@ def __init__(self, listen_addrs: list[multiaddr.Multiaddr]): ) self.pubsub = Pubsub(self.host, self.gossipsub) + # KAD-DHT + if 
dht_role == "server": + dht_mode = DHTMode.SERVER + else: + dht_mode = DHTMode.CLIENT + self.dht = KadDHT(self.host, dht_mode) + self.dht.register_validator("exp", ExampleValidator()) + # CLI input send/receive channels self.input_send_channel, self.input_receive_channel = trio.open_memory_channel( 100 @@ -105,7 +131,44 @@ async def command_executor(self, nursery): if cmd == "publish" and len(parts) > 2: await self.pubsub.publish(parts[1], parts[2].encode()) print(f"Published: {parts[2]}") - + + if cmd == "put" and len(parts) > 2: + key = parts[1] + value = parts[2].encode() + + await self.dht.put_value(key, value) + print(f"Stored value: {value.decode()} with key: {key}") + + if cmd == "get" and len(parts) > 1: + key = parts[1] + + retrieved_value = await self.dht.get_value(key) + if retrieved_value: + print(f"Retrieved value: {retrieved_value.decode()}") + else: + print("Failed to retrieve") + + if cmd == "advertize" and len(parts) > 1: + content_id = parts[1] + + success = await self.dht.provide(content_id) + if success: + print(f"Advertised as provider for content: {content_id}") + else: + print("Failed to advertise as provider") + + if cmd == "get_provider" and len(parts) > 1: + content_id = parts[1] + + providers = await self.dht.find_providers(content_id) + if providers: + print( + f"Found {len(providers)} providers: " + f"{[p.peer_id for p in providers]}" + ) + else: + print("No providers found") + if cmd == "local": maddr = self.host.get_addrs()[0] print(maddr) diff --git a/examples/metrics/runner.py b/examples/metrics/runner.py index 077c6bb67..4175d34bd 100644 --- a/examples/metrics/runner.py +++ b/examples/metrics/runner.py @@ -3,16 +3,19 @@ from examples.metrics.coordinator import COMMANDS, Node from libp2p.metrics.metrics import Metrics -from libp2p.tools.async_service.trio_service import ( - background_trio_service, -) +from libp2p.tools.anyio_service.context import background_trio_service from libp2p.utils.address_validation import 
get_available_interfaces async def main() -> None: + promt_session = PromptSession() + # Create a libp2p-node instance listen_addrs = get_available_interfaces(0) - node = Node(listen_addrs=listen_addrs) + node = Node( + listen_addrs=listen_addrs, + dht_role= "server", + ) async with ( node.host.run(listen_addrs=listen_addrs), @@ -23,34 +26,35 @@ async def main() -> None: async with background_trio_service(node.pubsub): async with background_trio_service(node.gossipsub): - await trio.sleep(1) - await node.pubsub.wait_until_ready() - print("Gossipsub and Pubsub services started !!") + async with background_trio_service(node.dht): + await trio.sleep(1) + await node.pubsub.wait_until_ready() + print("Gossipsub and Pubsub services started !!") + print(f"DHT service started with {node.dht.mode} mode") - # METRICS - metrics = Metrics() - nursery.start_soon( - metrics.start_prometheus_server, node.host.metric_recv_channel - ) - nursery.start_soon(node.command_executor, nursery) - await trio.sleep(1) + # METRICS + metrics = Metrics() + nursery.start_soon( + metrics.start_prometheus_server, node.host.metric_recv_channel + ) + nursery.start_soon(node.command_executor, nursery) + await trio.sleep(1) - print("Entering intractive mode, type commands below.") - promt_session = PromptSession() - print(COMMANDS) + print("Entering intractive mode, type commands below.") + print(COMMANDS) - while not node.termination_event.is_set(): - try: - _ = await trio.to_thread.run_sync(input) - user_input = await trio.to_thread.run_sync( - lambda: promt_session.prompt("Command> ") - ) - cmds = user_input.strip().split(" ", 2) - await node.input_send_channel.send(cmds) + while not node.termination_event.is_set(): + try: + _ = await trio.to_thread.run_sync(input) + user_input = await trio.to_thread.run_sync( + lambda: promt_session.prompt("Command> ") + ) + cmds = user_input.strip().split(" ", 2) + await node.input_send_channel.send(cmds) - except Exception as e: - print(f"Error in the 
interactive shell: {e}") - await trio.sleep(1) + except Exception as e: + print(f"Error in the interactive shell: {e}") + await trio.sleep(1) print("Shutdown complete, Goodbye!") diff --git a/libp2p/kad_dht/kad_dht.py b/libp2p/kad_dht/kad_dht.py index 9b9929228..c271ad404 100644 --- a/libp2p/kad_dht/kad_dht.py +++ b/libp2p/kad_dht/kad_dht.py @@ -114,6 +114,7 @@ class KadDhtEvent: get_providers: bool = False add_provider: bool = False + class KadDHT(Service): """ Kademlia DHT implementation for libp2p. @@ -877,7 +878,8 @@ async def handle_stream(self, stream: INetStream) -> None: logger.warning(f"Failed to parse protobuf message: {proto_err}") # Send KAD-DHT event to Metrics - stream.metric_send_channel.send(event) + if stream.metric_send_channel is not None: + await stream.metric_send_channel.send(event) await stream.close() except Exception as e: diff --git a/libp2p/metrics/kad_dht.py b/libp2p/metrics/kad_dht.py index d480c0fe9..9e78f119e 100644 --- a/libp2p/metrics/kad_dht.py +++ b/libp2p/metrics/kad_dht.py @@ -59,18 +59,26 @@ def __init__(self): def record(self, event: KadDhtEvent) -> None: if event.inbound: self.inbound.labels(peer_id=event.peer_id).inc() + print("inbound") if event.find_node: self.find_node.labels(peer_id=event.peer_id).inc() + print("find_node") if event.get_value: self.get_value.labels(peer_id=event.peer_id).inc() + print("get_value") if event.put_value: self.put_value.labels(peer_id=event.peer_id).inc() + print("put_value") if event.get_providers: self.get_providers.labels(peer_id=event.peer_id).inc() + print("get_provider") if event.add_provider: self.add_provider.labels(peer_id=event.peer_id).inc() + print("add_provider") + + print("\n") diff --git a/libp2p/pubsub/pubsub.py b/libp2p/pubsub/pubsub.py index 2ab0c7ed7..0dd517eb9 100644 --- a/libp2p/pubsub/pubsub.py +++ b/libp2p/pubsub/pubsub.py @@ -534,7 +534,9 @@ async def continuously_read_stream(self, stream: INetStream) -> None: ) await self.router.handle_rpc(rpc_incoming, peer_id) - 
await stream.metric_send_channel.send(event) + if stream.metric_send_channel is not None: + await stream.metric_send_channel.send(event) + except StreamEOF: logger.debug( f"Stream closed for peer {peer_id}, exiting read loop cleanly." From 7cc86cee1e3d420d951ea8829fe6fb0390318012 Mon Sep 17 00:00:00 2001 From: Abhinav Agarwalla Date: Sun, 22 Mar 2026 13:38:55 +0530 Subject: [PATCH 13/16] feat: added metrics for swarm-connection cycle --- examples/metrics/coordinator.py | 30 +++-- examples/metrics/runner.py | 2 +- libp2p/metrics/bandwidth.py | 86 ------------ libp2p/metrics/kad_dht.py | 9 +- libp2p/metrics/metrics.py | 4 + libp2p/metrics/relay.py | 64 --------- libp2p/metrics/swarm.py | 228 +++++++------------------------- libp2p/network/swarm.py | 47 +++++++ 8 files changed, 114 insertions(+), 356 deletions(-) delete mode 100644 libp2p/metrics/bandwidth.py delete mode 100644 libp2p/metrics/relay.py diff --git a/examples/metrics/coordinator.py b/examples/metrics/coordinator.py index fa43ef36f..9113e9051 100644 --- a/examples/metrics/coordinator.py +++ b/examples/metrics/coordinator.py @@ -14,7 +14,6 @@ from libp2p.pubsub.gossipsub import GossipSub from libp2p.pubsub.pubsub import Pubsub from libp2p.records.validator import Validator -from libp2p.utils.paths import get_script_dir, join_paths GOSSIPSUB_PROTOCOL_ID = TProtocol("/meshsub/1.0.0") COMMANDS = """ @@ -24,7 +23,7 @@ - join - Subscribe to a topic - leave - Unsubscribe to a topic - publish - Publish a message -- put - Execute PUT_VALUE in DHT +- put - Execute PUT_VALUE in DHT - get - Execute GET_VALUE in DHT - advertize - Execute ADD_PROVIDER in DHT - get_provider - Execute GET_PROVIDERS in DHT @@ -44,9 +43,7 @@ def select(self, key: str, values: list[bytes]) -> int: class Node: - def __init__( - self, listen_addrs: list[multiaddr.Multiaddr], dht_role: str - ): + def __init__(self, listen_addrs: list[multiaddr.Multiaddr], dht_role: str): # Create a libp2p-host self.host = new_host(listen_addrs=listen_addrs, 
enable_metrics=True) @@ -89,6 +86,11 @@ async def receive_loop(self, subsription): while not self.termination_event.is_set(): try: message = await subsription.get() + + from_peer_id = ID(message.from_id).to_base58() + if from_peer_id == self.host.get_id().pretty(): + continue + print(f"From: {ID(message.from_id).to_base58()}") print(f"Received: {message.data.decode('utf-8')}") except Exception: @@ -131,35 +133,35 @@ async def command_executor(self, nursery): if cmd == "publish" and len(parts) > 2: await self.pubsub.publish(parts[1], parts[2].encode()) print(f"Published: {parts[2]}") - + if cmd == "put" and len(parts) > 2: key = parts[1] value = parts[2].encode() - + await self.dht.put_value(key, value) print(f"Stored value: {value.decode()} with key: {key}") - + if cmd == "get" and len(parts) > 1: key = parts[1] - + retrieved_value = await self.dht.get_value(key) if retrieved_value: print(f"Retrieved value: {retrieved_value.decode()}") else: print("Failed to retrieve") - + if cmd == "advertize" and len(parts) > 1: content_id = parts[1] - + success = await self.dht.provide(content_id) if success: print(f"Advertised as provider for content: {content_id}") else: print("Failed to advertise as provider") - + if cmd == "get_provider" and len(parts) > 1: content_id = parts[1] - + providers = await self.dht.find_providers(content_id) if providers: print( @@ -168,7 +170,7 @@ async def command_executor(self, nursery): ) else: print("No providers found") - + if cmd == "local": maddr = self.host.get_addrs()[0] print(maddr) diff --git a/examples/metrics/runner.py b/examples/metrics/runner.py index 4175d34bd..33e68527d 100644 --- a/examples/metrics/runner.py +++ b/examples/metrics/runner.py @@ -14,7 +14,7 @@ async def main() -> None: listen_addrs = get_available_interfaces(0) node = Node( listen_addrs=listen_addrs, - dht_role= "server", + dht_role="server", ) async with ( diff --git a/libp2p/metrics/bandwidth.py b/libp2p/metrics/bandwidth.py deleted file mode 100644 index 
dd409f067..000000000 --- a/libp2p/metrics/bandwidth.py +++ /dev/null @@ -1,86 +0,0 @@ -from prometheus_client import Counter - - -class BandwidthMetrics: - """ - Prometheus bandwidth metrics for libp2p transport streams. - """ - - def __init__(self): - self.bandwidth = Counter( - "libp2p_bandwidth_bytes_total", - "Bandwidth usage by direction and protocol stack", - ["direction", "protocols"], - ) - - def outbound(self, protocols, n): - self.bandwidth.labels( - direction="outbound", - protocols=protocols, - ).inc(n) - - def inbound(self, protocols, n): - self.bandwidth.labels( - direction="inbound", - protocols=protocols, - ).inc(n) - - -class InstrumentedStream: - """ - Wraps a stream to measure bandwidth. - """ - - def __init__(self, stream, metrics: BandwidthMetrics, protocols: str): - self.stream = stream - self.metrics = metrics - self.protocols = protocols - - async def read(self, n=-1): - data = await self.stream.read(n) - - if data: - self.metrics.inbound(self.protocols, len(data)) - - return data - - async def write(self, data: bytes): - n = await self.stream.write(data) - - if n is None: - n = len(data) - - self.metrics.outbound(self.protocols, n) - - return n - - async def close(self): - await self.stream.close() - - -class TransportWrapper: - """ - Wraps a transport and instruments bandwidth. 
- """ - - def __init__(self, transport, metrics: BandwidthMetrics): - self.transport = transport - self.metrics = metrics - - async def dial(self, addr, protocols): - stream = await self.transport.dial(addr) - - return InstrumentedStream( - stream, - self.metrics, - protocols, - ) - - async def accept(self, protocols): - stream = await self.transport.accept() - - return InstrumentedStream( - stream, - self.metrics, - protocols, - ) diff --git a/libp2p/metrics/kad_dht.py b/libp2p/metrics/kad_dht.py index 9e78f119e..a8d30f10d 100644 --- a/libp2p/metrics/kad_dht.py +++ b/libp2p/metrics/kad_dht.py @@ -11,6 +11,7 @@ # GET_PROVIDERS # ADD_PROVIDERS + class KadDhtMetrics: inbound: Counter find_node: Counter @@ -59,26 +60,18 @@ def __init__(self): def record(self, event: KadDhtEvent) -> None: if event.inbound: self.inbound.labels(peer_id=event.peer_id).inc() - print("inbound") if event.find_node: self.find_node.labels(peer_id=event.peer_id).inc() - print("find_node") if event.get_value: self.get_value.labels(peer_id=event.peer_id).inc() - print("get_value") if event.put_value: self.put_value.labels(peer_id=event.peer_id).inc() - print("put_value") if event.get_providers: self.get_providers.labels(peer_id=event.peer_id).inc() - print("get_provider") if event.add_provider: self.add_provider.labels(peer_id=event.peer_id).inc() - print("add_provider") - - print("\n") diff --git a/libp2p/metrics/metrics.py b/libp2p/metrics/metrics.py index 1beec8189..175c81317 100644 --- a/libp2p/metrics/metrics.py +++ b/libp2p/metrics/metrics.py @@ -8,6 +8,7 @@ from libp2p.metrics.gossipsub import GossipsubMetrics from libp2p.metrics.kad_dht import KadDhtMetrics from libp2p.metrics.ping import PingMetrics +from libp2p.metrics.swarm import SwarmEvent, SwarmMetrics from libp2p.pubsub.pubsub import GossipsubEvent @@ -30,6 +31,7 @@ def __init__(self): self.ping = PingMetrics() self.gossipsub = GossipsubMetrics() self.kad_dht = KadDhtMetrics() + self.swarm = SwarmMetrics() async def 
start_prometheus_server( self, @@ -62,3 +64,5 @@ async def start_prometheus_server( self.gossipsub.record(event) case KadDhtEvent(): self.kad_dht.record(event) + case SwarmEvent(): + self.swarm.record(event) diff --git a/libp2p/metrics/relay.py b/libp2p/metrics/relay.py deleted file mode 100644 index 464f57067..000000000 --- a/libp2p/metrics/relay.py +++ /dev/null @@ -1,64 +0,0 @@ -from dataclasses import dataclass - -from prometheus_client import Counter - - -@dataclass(slots=True) -class RelayEvent: - """ - Event emitted by the relay behaviour. - - Only the event type is required because the metrics layer - simply counts occurrences of each event type. - """ - - event_type: str - - -class RelayEventType: - """ - Equivalent of the Rust `EventType` enum. - """ - - RESERVATION_REQ_ACCEPTED = "ReservationReqAccepted" - RESERVATION_REQ_ACCEPT_FAILED = "ReservationReqAcceptFailed" - RESERVATION_REQ_DENIED = "ReservationReqDenied" - RESERVATION_REQ_DENY_FAILED = "ReservationReqDenyFailed" - RESERVATION_CLOSED = "ReservationClosed" - RESERVATION_TIMED_OUT = "ReservationTimedOut" - - CIRCUIT_REQ_DENIED = "CircuitReqDenied" - CIRCUIT_REQ_DENY_FAILED = "CircuitReqDenyFailed" - CIRCUIT_REQ_OUTBOUND_CONNECT_FAILED = "CircuitReqOutboundConnectFailed" - - CIRCUIT_REQ_ACCEPTED = "CircuitReqAccepted" - CIRCUIT_REQ_ACCEPT_FAILED = "CircuitReqAcceptFailed" - - CIRCUIT_CLOSED = "CircuitClosed" - - -class RelayMetrics: - """ - Prometheus metrics for relay behaviour. - - Equivalent to the Rust implementation: - - Family - - which becomes a Counter with labels in the Python Prometheus client. - """ - - events: Counter - - def __init__(self) -> None: - self.events = Counter( - "relay_events_total", - "Events emitted by the relay NetworkBehaviour", - labelnames=["event"], - ) - - def record(self, event: RelayEvent) -> None: - """ - Record a relay event. 
- """ - self.events.labels(event=event.event_type).inc() diff --git a/libp2p/metrics/swarm.py b/libp2p/metrics/swarm.py index 0e307334a..fa868e206 100644 --- a/libp2p/metrics/swarm.py +++ b/libp2p/metrics/swarm.py @@ -1,6 +1,20 @@ -import time +from prometheus_client import Counter -from prometheus_client import Counter, Histogram +# METRICS + +# conn_incoming +# conn_incoming_error +# dial_attemp +# conn_outgoing_error + + +class SwarmEvent: + peer_id: str | None = None + + conn_incoming: bool = False + conn_incoming_error: bool = False + dial_attempt: bool = False + dial_attempt_error: bool = False class SwarmMetrics: @@ -9,197 +23,45 @@ class SwarmMetrics: Mirrors the Rust libp2p metrics implementation. """ - def __init__(self): - # --------------------------- - # incoming connections - # --------------------------- - - self.connections_incoming = Counter( - "swarm_connections_incoming_total", - "Number of incoming connections per address stack", - ["protocols"], - ) - - self.connections_incoming_error = Counter( - "swarm_connections_incoming_error_total", - "Number of incoming connection errors", - ["error", "protocols"], - ) - - # --------------------------- - # connection lifecycle - # --------------------------- - - self.connections_established = Counter( - "swarm_connections_established_total", - "Number of connections established", - ["role", "protocols"], - ) - - self.connections_establishment_duration = Histogram( - "swarm_connections_establishment_duration_seconds", - "Time taken to establish connection", - ["role", "protocols"], - buckets=(0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1, 2, 5, 10), - ) - - self.connections_duration = Histogram( - "swarm_connections_duration_seconds", - "Time a connection was alive", - ["role", "protocols", "cause"], - buckets=(0.01, 0.1, 1, 5, 10, 30, 60, 300, 600), - ) - - # --------------------------- - # listening addresses - # --------------------------- - - self.new_listen_addr = Counter( - "swarm_new_listen_addr_total", - 
"Number of new listen addresses", - ["protocols"], - ) - - self.expired_listen_addr = Counter( - "swarm_expired_listen_addr_total", - "Number of expired listen addresses", - ["protocols"], - ) - - # --------------------------- - # external addresses - # --------------------------- - - self.external_addr_candidates = Counter( - "swarm_external_addr_candidates_total", - "Number of new external address candidates", - ["protocols"], - ) - - self.external_addr_confirmed = Counter( - "swarm_external_addr_confirmed_total", - "Number of confirmed external addresses", - ["protocols"], - ) - - self.external_addr_expired = Counter( - "swarm_external_addr_expired_total", - "Number of expired external addresses", - ["protocols"], - ) + conn_incoming: Counter + conn_incoming_error: Counter + dial_attempt: Counter + dial_attempt_error: Counter - # --------------------------- - # listener lifecycle - # --------------------------- - - self.listener_closed = Counter( - "swarm_listener_closed_total", - "Number of listeners closed", - ["protocols"], + def __init__(self): + self.conn_incoming = Counter( + "swarm_incoming_conn", + "Incoming connection received by libp2p-swarm", + labelnames=["peer_id"], ) - self.listener_error = Counter( - "swarm_listener_error_total", - "Number of listener errors", + self.conn_incoming_error = Counter( + "swarm_incoming_conn_error", + "Incoming connection failure in libp2p-swarm", + labelnames=["peer_id"], ) - # --------------------------- - # dialing - # --------------------------- - self.dial_attempt = Counter( - "swarm_dial_attempt_total", - "Number of dial attempts", + "swarm_dial_attempt", + "Dial attempts made by libp2p-swarm", + labelnames=["peer_id"], ) - self.outgoing_connection_error = Counter( - "swarm_outgoing_connection_error_total", - "Outgoing connection errors", - ["peer", "error"], + self.dial_attempt_error = Counter( + "swarm_dial_attempt_error", + "Outgoing connection failure in libp2p-swarm", + labelnames=["peer_id"], ) - # 
--------------------------- - # connection tracking - # --------------------------- - - self.connections = {} - - # ------------------------------------------------- - - def record(self, event): - """ - Record a SwarmEvent-like object. - """ - etype = event["type"] - - if etype == "ConnectionEstablished": - role = event["role"] - protocols = event["protocols"] - conn_id = event["connection_id"] - duration = event.get("established_in", 0) - - self.connections_established.labels( - role=role, - protocols=protocols, - ).inc() - - self.connections_establishment_duration.labels( - role=role, - protocols=protocols, - ).observe(duration) - - self.connections[conn_id] = time.time() - - elif etype == "ConnectionClosed": - conn_id = event["connection_id"] - role = event["role"] - protocols = event["protocols"] - cause = event.get("cause", "None") - - if conn_id in self.connections: - elapsed = time.time() - self.connections.pop(conn_id) - - self.connections_duration.labels( - role=role, - protocols=protocols, - cause=cause, - ).observe(elapsed) - - elif etype == "IncomingConnection": - self.connections_incoming.labels(protocols=event["protocols"]).inc() - - elif etype == "IncomingConnectionError": - self.connections_incoming_error.labels( - error=event["error"], - protocols=event["protocols"], - ).inc() - - elif etype == "OutgoingConnectionError": - self.outgoing_connection_error.labels( - peer=event["peer"], - error=event["error"], - ).inc() - - elif etype == "NewListenAddr": - self.new_listen_addr.labels(protocols=event["protocols"]).inc() - - elif etype == "ExpiredListenAddr": - self.expired_listen_addr.labels(protocols=event["protocols"]).inc() - - elif etype == "ListenerClosed": - self.listener_closed.labels(protocols=event["protocols"]).inc() - - elif etype == "ListenerError": - self.listener_error.inc() - - elif etype == "Dialing": - self.dial_attempt.inc() + def record(self, event: SwarmEvent) -> None: + if event.conn_incoming: + 
self.conn_incoming.labels(peer_id=event.peer_id).inc() - elif etype == "NewExternalAddrCandidate": - self.external_addr_candidates.labels(protocols=event["protocols"]).inc() + if event.conn_incoming_error: + self.conn_incoming_error.labels(peer_id=event.peer_id).inc() - elif etype == "ExternalAddrConfirmed": - self.external_addr_confirmed.labels(protocols=event["protocols"]).inc() + if event.dial_attempt: + self.dial_attempt.labels(peer_id=event.peer_id).inc() - elif etype == "ExternalAddrExpired": - self.external_addr_expired.labels(protocols=event["protocols"]).inc() + if event.dial_attempt_error: + self.dial_attempt_error.labels(peer_id=event.peer_id).inc() diff --git a/libp2p/network/swarm.py b/libp2p/network/swarm.py index 14d4c672c..4aaccf5ca 100644 --- a/libp2p/network/swarm.py +++ b/libp2p/network/swarm.py @@ -7,6 +7,7 @@ import random from typing import TYPE_CHECKING, Any, cast +from libp2p.metrics.swarm import SwarmEvent from libp2p.rcmgr import Direction if TYPE_CHECKING: @@ -490,6 +491,14 @@ async def dial_peer(self, peer_id: ID) -> list[INetConn]: :raises SwarmException: raised when an error occurs :return: list of muxed connections """ + # Emit metric-event for dial-attempt + event = SwarmEvent() + event.peer_id = peer_id + event.dial_attempt = True + + if self.metric_send_channel is not None: + await self.metric_send_channel.send(event) + # Check if we already have connections existing_connections = self.get_connections(peer_id) if existing_connections: @@ -546,6 +555,15 @@ async def dial_peer(self, peer_id: ID) -> list[INetConn]: if not connections: # Tried all addresses, raising exception. 
+ + # Emit metric-event for dial_attempt failure + event = SwarmEvent() + event.peer_id = peer_id + event.dial_attempt_error = True + + if self.metric_send_channel is not None: + await self.metric_send_channel.send(event) + raise SwarmDialAllFailedError( f"unable to connect to {peer_id}, no addresses established a " "successful connection (with exceptions)", @@ -1148,14 +1166,30 @@ async def conn_handler( remote_maddr = self._build_remote_multiaddr(read_write_closer) logger.debug(f"[conn_handler] Built remote_maddr: {remote_maddr}") + # Emit a metric-event that we received an inbound connection + inbound_notification = SwarmEvent() + inbound_notification.conn_incoming = True + if self.metric_send_channel is not None: + await self.metric_send_channel.send(inbound_notification) + + # Metric event for inbound connection failure + failure_event = SwarmEvent() + if remote_maddr is not None: if not await self.connection_gate.is_allowed(remote_maddr): logger.debug( "Inbound connection from %s denied by connection gate", remote_maddr, ) + # INbound error try: await read_write_closer.close() + + # Emit event for incoming conn failure + failure_event.conn_incoming_error = True + if self.metric_send_channel is not None: + await self.metric_send_channel.send(failure_event) + except Exception: pass return @@ -1172,8 +1206,15 @@ async def conn_handler( # NOTE: This is a intentional barrier to prevent from the # handler exiting and closing the connection. 
await self.manager.wait_finished() + except Exception: await read_write_closer.close() + + # Emit event for incoming conn failure + failure_event.conn_incoming_error = True + if self.metric_send_channel is not None: + await self.metric_send_channel.send(failure_event) + return # For non-QUIC connections, wrap in try/except to ensure cleanup @@ -1194,6 +1235,12 @@ async def conn_handler( # If raw_conn wasn't created, # close the underlying connection await read_write_closer.close() + + # Emit event for incoming conn failure + failure_event.conn_incoming_error = True + if self.metric_send_channel is not None: + await self.metric_send_channel.send(failure_event) + except Exception: pass # Re-raise to let the listener handle it appropriately From 5fb3087230191e311d94105cde59b8d4ad9dbe96 Mon Sep 17 00:00:00 2001 From: Abhinav Agarwalla Date: Sun, 22 Mar 2026 18:50:14 +0530 Subject: [PATCH 14/16] chore: fixed all linter errors --- docs/examples.metrics.rst | 168 ++++++++++++++++++ docs/examples.rst | 1 + docs/libp2p.metrics.rst | 53 ++++++ docs/libp2p.rst | 1 + examples/kademlia/server_node_addr.txt | 1 - examples/metrics/README.md | 167 +++++++++++++++++ examples/metrics/prometheus.yml | 9 - examples/metrics/runner.py | 6 +- libp2p/__init__.py | 2 +- libp2p/abc.py | 12 +- libp2p/host/basic_host.py | 14 +- libp2p/host/ping.py | 15 +- libp2p/host/routed_host.py | 10 +- libp2p/kad_dht/kad_dht.py | 2 +- libp2p/metrics/gossipsub.py | 4 +- libp2p/metrics/kad_dht.py | 2 +- libp2p/metrics/metrics.py | 25 +-- libp2p/metrics/ping.py | 4 +- libp2p/metrics/swarm.py | 2 +- libp2p/network/connection/swarm_connection.py | 2 +- libp2p/network/stream/net_stream.py | 5 +- libp2p/network/swarm.py | 6 +- libp2p/pubsub/pubsub.py | 2 +- newsfragments/1199.rst | 1 + .../network/test_net_stream_concurrency.py | 8 +- .../network/test_net_stream_error_state.py | 4 +- .../test_net_stream_state_transitions.py | 2 +- 27 files changed, 462 insertions(+), 66 deletions(-) create mode 100644 
docs/examples.metrics.rst create mode 100644 docs/libp2p.metrics.rst delete mode 100644 examples/kademlia/server_node_addr.txt create mode 100644 examples/metrics/README.md delete mode 100644 examples/metrics/prometheus.yml create mode 100644 newsfragments/1199.rst diff --git a/docs/examples.metrics.rst b/docs/examples.metrics.rst new file mode 100644 index 000000000..6065a719e --- /dev/null +++ b/docs/examples.metrics.rst @@ -0,0 +1,168 @@ +Metrics Demo +============ + +This example demonstrates how to run multiple libp2p services (Ping, Pubsub/Gossipsub, Kad-DHT) in a single node and observe +their behaviour through Prometheus + Grafana metrics dashboards. + +.. code-block:: console + + $ python -m pip install libp2p + Collecting libp2p + ... + Successfully installed libp2p-x.x.x + + $ metrics-demo + Host multiaddr: /ip4/172.16.68.73/tcp/41173/p2p/12D3KooWD2DFvDs4wekLWU8sAUJJgivbRbiiKkX9yQ3kGhuCwCqL + Gossipsub and Pubsub services started !! + DHT service started with DHTMode.SERVER mode + Starting command executor loop... + + Prometheus metrics visible at: http://localhost:8000 + + To start prometheus and grafana dashboards, from another terminal: + PROMETHEUS_PORT=9001 GRAFANA_PORT=7001 docker compose up + + After this: + Prometheus dashboard will be visible at: http://localhost:9001 + Grafana dashboard will be visible at: http://localhost:7001 + + Entering intractive mode, type commands below. + + Available commands: + - connect - Connect to another peer + ... + +Now in this way a node can be started, now start another node in a different terminal +and make a connection between them so that they can communicate: + +.. code-block:: console + + $ metrics-demo + $ connect /ip4/172.16.68.73/tcp/41173/p2p/12D3KooWD2DFvDs4wekLWU8sAUJJgivbRbiiKkX9yQ3kGhuCwCqL + Connected to 12D3KooWD2DFvDs4wekLWU8sAUJJgivbRbiiKkX9yQ3kGhuCwCqL + +Now we can communicate between the 2 nodes via Ping, Gossipsub and Kad-DHT. Before that we have to +
For this create a `docker-compose.yml` file like this: + +.. code-block:: console + + services: + prometheus: + image: prom/prometheus:latest + container_name: prometheus + volumes: + - ./prometheus.yml:/etc/prometheus/prometheus.yml:ro + ports: + - "${PROMETHEUS_PORT}:9090" + extra_hosts: + - "host.docker.internal:host-gateway" + + grafana: + image: grafana/grafana:latest + container_name: grafana + ports: + - "${GRAFANA_PORT}:3000" + depends_on: + - prometheus + +And run it like this +.. code-block:: console + + PROMETHEUS_PORT=9001 GRAFANA_PORT=7001 docker compose up + +A similar file is present in `py-libp2p/libp2p/metrics` directory also, so either create a new docker-compose +file or run it from the above path. This basically starts a prometheus and grafana server in your localhost, +with which the metrics can be viewed in graph format. + +Now see how to communicate between the 2 nodes, via Pubsub/Gossipsub, Ping and Kad-DHT + +PING +==== + +The following metrics are exposed in this service: +- ping: Round-trip time sending a `ping` and receiving a `pong` +- ping_failure: Failure while sending a ping or receiving a ping + +.. code-block:: console + + $ ping /ip4/172.16.68.73/tcp/41173/p2p/12D3KooWD2DFvDs4wekLWU8sAUJJgivbRbiiKkX9yQ3kGhuCwCqL 15 + [401, 419, 428, 353, 354, 353, 369, 371, 353, 380, 352, 343, 378, 324, 412] + +The output will the rtts took for each ping/ping to complete. +The updated metrics can be visualized in the dashboards. + +Pubsub/Gossipsub +================ + +The following metrics are exposed in this service: +- gossipsub_received_total: Messages successfully received +- gossipsub_publish_total: Messages to be published +- gossipsub_subopts_total: Messages notifying peer subscriptions +- gossipsub_control_total: Received control messages +- gossipsub_message_bytes: Message size in bytes + +To communicate via gossipsub, join the same topics on both the nodes and publish messages +on that topic to get it received on both sides. + +.. 
code-block:: console + + $ join pubsub-chat + Subscribed to pubsub-chat + Starting receive loop + +Do this on both the terminals. Then publish a message from one side, and see it received on the other side. + +.. code-block:: console + + $ publish pubsub-chat hello-from-pubsub! + +See the updated metrics in the dashboards. + +KAD-DHT +======= + +The following metrics are exposed in this service: +- kad_inbound_total: Total inbound requests received +- kad_inbound_find_node: Total inbound FIND_NODE requests received +- kad_inbound_get_value: Total inbound GET_VALUE requests received +- kad_inbound_put_value: Total inbound PUT_VALUE requests received +- kad_inbound_get_providers: Total inbound GET_PROVIDERS requests received +- kad_inbound_add_provider: Total inbound ADD_PROVIDER requests received + +To interact between the 2 nodes via kad-dht, we have 2 ways: +- `PUT_VALUE` in one node, and `GET_VALUE` in another +- `ADD_PROVIDER` in one node, and `GET_PROVIDERS` in another + +.. code-block:: console + + $ put /exp/fa kad-dht-value + Stored value: kad-dht-value with key: /exp/fa + + # From another terminal + $ get /exp/fa + Retrieved value: kad-dht-value + +.. code-block:: console + + $ advertize content-id + Advertised as provider for content: content-id + + # From another terminal + $ get_provider content-id + Found 1 providers: [] + +SWARM-CONNECTION-EVENTS +======================= + +Other than the above 3 services, the incoming/outgoing connection cycle is also monitored via the +following metrics: +- swarm_incoming_conn: Incoming connection received by libp2p-swarm +- swarm_incoming_conn_error: Incoming connection failure in libp2p-swarm +- swarm_dial_attempt: Dial attempts made by libp2p-swarm +- swarm_dial_attempt_error: Outgoing connection failure in libp2p-swarm + +The full source code for this example is below: + +..
literalinclude:: ../examples/metrics/runner.py + :language: python + :linenos: diff --git a/docs/examples.rst b/docs/examples.rst index 09f0edc59..7e1d69c42 100644 --- a/docs/examples.rst +++ b/docs/examples.rst @@ -29,3 +29,4 @@ Examples examples.autotls examples.perf examples.path_handling + examples.metrics diff --git a/docs/libp2p.metrics.rst b/docs/libp2p.metrics.rst new file mode 100644 index 000000000..8a1b31797 --- /dev/null +++ b/docs/libp2p.metrics.rst @@ -0,0 +1,53 @@ +libp2p.metrics package +====================== + +Submodules +---------- + +libp2p.metrics.gossipsub module +------------------------------- + +.. automodule:: libp2p.metrics.gossipsub + :members: + :undoc-members: + :show-inheritance: + +libp2p.metrics.kad_dht module +----------------------------- + +.. automodule:: libp2p.metrics.kad_dht + :members: + :undoc-members: + :show-inheritance: + +libp2p.metrics.metrics module +----------------------------- + +.. automodule:: libp2p.metrics.metrics + :members: + :undoc-members: + :show-inheritance: + +libp2p.metrics.ping module +-------------------------- + +.. automodule:: libp2p.metrics.ping + :members: + :undoc-members: + :show-inheritance: + +libp2p.metrics.swarm module +--------------------------- + +.. automodule:: libp2p.metrics.swarm + :members: + :undoc-members: + :show-inheritance: + +Module contents +--------------- + +.. 
automodule:: libp2p.metrics + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/libp2p.rst b/docs/libp2p.rst index fb2ab82b0..21a0cdb3f 100644 --- a/docs/libp2p.rst +++ b/docs/libp2p.rst @@ -28,6 +28,7 @@ Subpackages libp2p.tools libp2p.transport libp2p.utils + libp2p.metrics Submodules ---------- diff --git a/examples/kademlia/server_node_addr.txt b/examples/kademlia/server_node_addr.txt deleted file mode 100644 index 23f7025a2..000000000 --- a/examples/kademlia/server_node_addr.txt +++ /dev/null @@ -1 +0,0 @@ -/ip4/172.16.68.73/tcp/22943/p2p/16Uiu2HAmLipMqhDY6J6pbQzNJRVE9Ztt1qZfK6raPb5UDRwVM3Gz diff --git a/examples/metrics/README.md b/examples/metrics/README.md new file mode 100644 index 000000000..38bcbf2b7 --- /dev/null +++ b/examples/metrics/README.md @@ -0,0 +1,167 @@ +## Metrics Demo + +This example demonstrates how to run multiple libp2p services (Ping, Pubsub/Gossipsub, Kad-DHT) in a single node and observe +their behaviour through Prometheus + Grafana metrics dashboards. + +```bash +$ python -m pip install libp2p +Collecting libp2p +... +Successfully installed libp2p-x.x.x + +$ metrics-demo +Host multiaddr: /ip4/172.16.68.73/tcp/41173/p2p/12D3KooWD2DFvDs4wekLWU8sAUJJgivbRbiiKkX9yQ3kGhuCwCqL +Gossipsub and Pubsub services started !! +DHT service started with DHTMode.SERVER mode +Starting command executor loop... + +Prometheus metrics visible at: http://localhost:8000 + +To start prometheus and grafana dashboards, from another terminal: +PROMETHEUS_PORT=9001 GRAFANA_PORT=7001 docker compose up + +After this: +Prometheus dashboard will be visible at: http://localhost:9001 +Grafana dashboard will be visible at: http://localhost:7001 + +Entering intractive mode, type commands below. + +Available commands: +- connect - Connect to another peer +...
+``` + +Now in this way a node can be started, now start another node in a different terminal +and make a connection between them so that they can communicate: + +```bash +$ metrics-demo +$ connect /ip4/172.16.68.73/tcp/41173/p2p/12D3KooWD2DFvDs4wekLWU8sAUJJgivbRbiiKkX9yQ3kGhuCwCqL +Connected to 12D3KooWD2DFvDs4wekLWU8sAUJJgivbRbiiKkX9yQ3kGhuCwCqL +``` + +Now we can communicate between the 2 nodes via Ping, Gossipsub and Kad-DHT. Before that we have to +start the prometheus and grafana dashboards. For this create a `docker-compose.yml` file like this: + +```yml + services: + prometheus: + image: prom/prometheus:latest + container_name: prometheus + volumes: + - ./prometheus.yml:/etc/prometheus/prometheus.yml:ro + ports: + - "${PROMETHEUS_PORT}:9090" + extra_hosts: + - "host.docker.internal:host-gateway" + + grafana: + image: grafana/grafana:latest + container_name: grafana + ports: + - "${GRAFANA_PORT}:3000" + depends_on: + - prometheus +``` + +And run it like this + +```bash +PROMETHEUS_PORT=9001 GRAFANA_PORT=7001 docker compose up +``` + +A similar file is present in `py-libp2p/libp2p/metrics` directory also, so either create a new docker-compose +file or run it from the above path. This basically starts a prometheus and grafana server in your localhost, +with which the metrics can be viewed in graph format. + +Now see how to communicate between the 2 nodes, via Pubsub/Gossipsub, Ping and Kad-DHT + +### PING + +The following metrics are exposed in this service: + +- ping: Round-trip time sending a `ping` and receiving a `pong` +- ping_failure: Failure while sending a ping or receiving a ping + +```bash +$ ping /ip4/172.16.68.73/tcp/41173/p2p/12D3KooWD2DFvDs4wekLWU8sAUJJgivbRbiiKkX9yQ3kGhuCwCqL 15 +[401, 419, 428, 353, 354, 353, 369, 371, 353, 380, 352, 343, 378, 324, 412] +``` + +The output will be the RTTs taken for each ping/pong to complete. +The updated metrics can be visualized in the dashboards.
+ +### Pubsub/Gossipsub + +The following metrics are exposed in this service: + +- gossipsub_received_total: Messages successfully received +- gossipsub_publish_total: Messages to be published +- gossipsub_subopts_total: Messages notifying peer subscriptions +- gossipsub_control_total: Received control messages +- gossipsub_message_bytes: Message size in bytes + +To communicate via gossipsub, join the same topics on both the nodes and publish messages +on that topic to get it received on both sides. + +```bash +$ join pubsub-chat +Subscribed to pubsub-chat +Starting receive loop +``` + +Do this on both the terminals. Then publish a message from one side, and see it received on the other side. + +```bash +$ publish pubsub-chat hello-from-pubsub! +``` + +See the updated metrics in the dashboards. + +### KAD-DHT + +The following metrics are exposed in this service: + +- kad_inbound_total: Total inbound requests received +- kad_inbound_find_node: Total inbound FIND_NODE requests received +- kad_inbound_get_value: Total inbound GET_VALUE requests received +- kad_inbound_put_value: Total inbound PUT_VALUE requests received +- kad_inbound_get_providers: Total inbound GET_PROVIDERS requests received +- kad_inbound_add_provider: Total inbound ADD_PROVIDER requests received + +To interact between the 2 nodes via kad-dht, we have 2 ways: + +- `PUT_VALUE` in one node, and `GET_VALUE` in another +- `ADD_PROVIDER` in one node, and `GET_PROVIDERS` in another + +#### PUT_VALUE/GET_VALUE + +```bash +$ put /exp/fa kad-dht-value +Stored value: kad-dht-value with key: /exp/fa + +# From another terminal +$ get /exp/fa +Retrieved value: kad-dht-value +``` + +#### ADD_PROVIDER/GET_PROVIDERS + +```bash +$ advertize content-id +Advertised as provider for content: content-id + +# From another terminal +$ get_provider content-id +Found 1 providers: [] +``` + +### SWARM-CONNECTION-EVENTS + +Other than the above 3 services, the incoming/outgoing connection cycle is also monitored via the 
+following metrics: + +- swarm_incoming_conn: Incoming connection received by libp2p-swarm +- swarm_incoming_conn_error: Incoming connection failure in libp2p-swarm +- swarm_dial_attempt: Dial attempts made by libp2p-swarm +- swarm_dial_attempt_error: Outgoing connection failure in libp2p-swarm diff --git a/examples/metrics/prometheus.yml b/examples/metrics/prometheus.yml deleted file mode 100644 index 9a5171884..000000000 --- a/examples/metrics/prometheus.yml +++ /dev/null @@ -1,9 +0,0 @@ -global: - scrape_interval: 5s - evaluation_interval: 5s - -scrape_configs: - - job_name: "python-app" - static_configs: - - targets: - - "localhost:58819" diff --git a/examples/metrics/runner.py b/examples/metrics/runner.py index 33e68527d..9a782565f 100644 --- a/examples/metrics/runner.py +++ b/examples/metrics/runner.py @@ -34,8 +34,10 @@ async def main() -> None: # METRICS metrics = Metrics() + metrics_recv_channel = node.host.get_metrics_recv_channel() + nursery.start_soon( - metrics.start_prometheus_server, node.host.metric_recv_channel + metrics.start_prometheus_server, metrics_recv_channel ) nursery.start_soon(node.command_executor, nursery) await trio.sleep(1) @@ -62,7 +64,7 @@ async def main() -> None: def cli() -> None: try: trio.run(main) - except* KeyboardInterrupt: + except KeyboardInterrupt: print("Session terminated by user") diff --git a/libp2p/__init__.py b/libp2p/__init__.py index f7d774676..61bab0230 100644 --- a/libp2p/__init__.py +++ b/libp2p/__init__.py @@ -296,7 +296,7 @@ def new_swarm( tls_server_config: ssl.SSLContext | None = None, resource_manager: ResourceManager | None = None, psk: str | None = None, - metric_send_channel: trio.MemorySendChannel | None = None + metric_send_channel: trio.MemorySendChannel[Any] | None = None ) -> INetworkService: logger.debug(f"new_swarm: enable_quic={enable_quic}, listen_addrs={listen_addrs}") """ diff --git a/libp2p/abc.py b/libp2p/abc.py index 67181b17e..9f48be7d7 100644 --- a/libp2p/abc.py +++ b/libp2p/abc.py @@ 
-327,7 +327,7 @@ class INetStream(ReadWriteCloser): """ muxed_conn: IMuxedConn - metric_send_channel: trio.MemorySendChannel | None + metric_send_channel: trio.MemorySendChannel[Any] | None @abstractmethod def get_protocol(self) -> TProtocol | None: @@ -2087,6 +2087,12 @@ def remove_stream_handler(self, protocol_id: TProtocol) -> None: """ + @abstractmethod + def get_metrics_recv_channel(self) -> trio.MemoryReceiveChannel[Any] | None: + """ + Returns the recving end of the channel, used for metric events + """ + @abstractmethod async def initiate_autotls_procedure(self, public_ip: str | None = None) -> None: """ @@ -2208,10 +2214,6 @@ async def upgrade_inbound_connection( """ - @abstractmethod - async def next_event(self) -> None: - """""" - # -------------------------- peer-record interface.py -------------------------- class IPeerRecord(ABC): diff --git a/libp2p/host/basic_host.py b/libp2p/host/basic_host.py index 012a89d0c..24bae288d 100644 --- a/libp2p/host/basic_host.py +++ b/libp2p/host/basic_host.py @@ -187,8 +187,7 @@ def __init__( default_protocols: OrderedDict[TProtocol, StreamHandlerFn] | None = None, negotiate_timeout: int = DEFAULT_NEGOTIATE_TIMEOUT, resource_manager: ResourceManager | None = None, - psk: str | None = None, - metric_recv_channel: trio.MemoryReceiveChannel | None = None, + metric_recv_channel: trio.MemoryReceiveChannel[Any] | None = None, *, bootstrap_allow_ipv6: bool = False, bootstrap_dns_timeout: float = 10.0, @@ -251,7 +250,6 @@ def __init__( dns_resolution_timeout=bootstrap_dns_timeout, dns_max_retries=bootstrap_dns_max_retries, ) - self.psk = psk # Cache a signed-record if the local-node in the PeerStore envelope = create_signed_peer_record( @@ -490,6 +488,12 @@ def _preferred_protocol( ) return None + def get_metrics_recv_channel(self) -> trio.MemoryReceiveChannel[Any] | None: + """ + Returns the recving end of the channel, used for metric events + """ + return self.metric_recv_channel + async def initiate_autotls_procedure(self, 
public_ip: str | None = None) -> None: """ Run the AutoTLS certificate provisioning flow for this host. @@ -902,10 +906,6 @@ async def connect(self, peer_info: PeerInfo) -> None: # Kick off identify in the background so protocol caching can engage. self._schedule_identify(peer_info.peer_id, reason="connect") - async def next_event(self): - event = await self.metric_recv_channel.receive() - return event - async def _run_identify(self, peer_id: ID) -> None: """ Run identify protocol with a peer to discover supported protocols. diff --git a/libp2p/host/ping.py b/libp2p/host/ping.py index eb45ab540..e9a506348 100644 --- a/libp2p/host/ping.py +++ b/libp2p/host/ping.py @@ -118,23 +118,24 @@ def __init__(self, host: IHost): async def ping(self, peer_id: PeerID, ping_amt: int = 1) -> list[int]: stream = await self._host.new_stream(peer_id, [ID]) + rtts: list[int] + event: PingEvent + try: rtts = [await _ping(stream) for _ in range(ping_amt)] - await stream.close() - event = PingEvent( peer_id=peer_id, rtts=rtts, failure_error=None, ) - return rtts - except Exception as error: - await stream.close() - event = PingEvent(peer_id=peer_id, rtts=None, failure_error=error) raise finally: - await stream.metric_send_channel.send(event) + await stream.close() + if stream.metric_send_channel is not None: + await stream.metric_send_channel.send(event) + + return rtts diff --git a/libp2p/host/routed_host.py b/libp2p/host/routed_host.py index 2a49f62ce..13c0c5fdd 100644 --- a/libp2p/host/routed_host.py +++ b/libp2p/host/routed_host.py @@ -33,7 +33,6 @@ def __init__( router: IPeerRouting, enable_mDNS: bool = False, enable_upnp: bool = False, - enable_autotls: bool = False, bootstrap: list[str] | None = None, resource_manager: ResourceManager | None = None, *, @@ -57,11 +56,10 @@ def __init__( :param bootstrap_dns_max_retries: Max DNS resolution retries (with backoff). 
""" super().__init__( - network, - enable_mDNS, - enable_upnp, - enable_autotls, - bootstrap, + network=network, + enable_mDNS=enable_mDNS, + enable_upnp=enable_upnp, + bootstrap=bootstrap, resource_manager=resource_manager, bootstrap_allow_ipv6=bootstrap_allow_ipv6, bootstrap_dns_timeout=bootstrap_dns_timeout, diff --git a/libp2p/kad_dht/kad_dht.py b/libp2p/kad_dht/kad_dht.py index c271ad404..dc059b1ff 100644 --- a/libp2p/kad_dht/kad_dht.py +++ b/libp2p/kad_dht/kad_dht.py @@ -485,7 +485,7 @@ async def handle_stream(self, stream: INetStream) -> None: ) event = KadDhtEvent() - event.peer_id = peer_id + event.peer_id = peer_id.pretty() event.inbound = True # Handle FIND_NODE message diff --git a/libp2p/metrics/gossipsub.py b/libp2p/metrics/gossipsub.py index a20874e60..b2dd4226e 100644 --- a/libp2p/metrics/gossipsub.py +++ b/libp2p/metrics/gossipsub.py @@ -11,9 +11,9 @@ class GossipsubMetrics: received: Counter msg_size: Histogram - def __init__(self): + def __init__(self) -> None: self.received = Counter( - "gossipsub_receiived_total", + "gossipsub_received_total", "Messages successfully received", labelnames=["peer_id"], ) diff --git a/libp2p/metrics/kad_dht.py b/libp2p/metrics/kad_dht.py index a8d30f10d..aae969d0d 100644 --- a/libp2p/metrics/kad_dht.py +++ b/libp2p/metrics/kad_dht.py @@ -20,7 +20,7 @@ class KadDhtMetrics: get_providers: Counter add_provider: Counter - def __init__(self): + def __init__(self) -> None: self.inbound = Counter( "kad_inbound_total", "Total inbound requests received", diff --git a/libp2p/metrics/metrics.py b/libp2p/metrics/metrics.py index 175c81317..ccd4d27e1 100644 --- a/libp2p/metrics/metrics.py +++ b/libp2p/metrics/metrics.py @@ -1,4 +1,5 @@ import socket +from typing import Any from prometheus_client import start_http_server import trio @@ -23,11 +24,16 @@ def find_available_port(start_port: int = 8000, host: str = "127.0.0.1") -> int: except OSError: port += 1 + raise RuntimeError("Unreachable") + class Metrics: ping: PingMetrics 
+ gossipsub: GossipsubMetrics + kad_dht: KadDhtMetrics + swarm: SwarmMetrics - def __init__(self): + def __init__(self) -> None: self.ping = PingMetrics() self.gossipsub = GossipsubMetrics() self.kad_dht = KadDhtMetrics() @@ -35,23 +41,22 @@ def __init__(self): async def start_prometheus_server( self, - metric_recv_channel: trio.MemoryReceiveChannel, + metric_recv_channel: trio.MemoryReceiveChannel[Any], ) -> None: metrics = find_available_port(8000) - prometheus_dashboard = find_available_port(9000) - grafana_dashboard = find_available_port(7000) + prometheus = find_available_port(9000) + grafana = find_available_port(7000) start_http_server(metrics) print(f"\nPrometheus metrics visible at: http://localhost:{metrics}") - print( - f"Prometheus dashboard visible at: http://localhost:{prometheus_dashboard}" - ) - print(f"Grafana dashboard visible at: http://localhost:{grafana_dashboard}\n") print( - "\nStart prometheus and grafana dashboard, for another terminal: \n" - f"PROMETHEUS_PORT={prometheus_dashboard} GRAFANA_PORT={grafana_dashboard} docker compose up\n" + "\nTo start prometheus and grafana dashboards, from another terminal: \n" + f"PROMETHEUS_PORT={prometheus} GRAFANA_PORT={grafana} docker compose up\n" + "\nAfter this:\n" + f"Prometheus dashboard will be visible at: http://localhost:{prometheus}\n" + f"Grafana dashboard will be visible at: http://localhost:{grafana}\n" ) while True: diff --git a/libp2p/metrics/ping.py b/libp2p/metrics/ping.py index 5e4652f30..81dbfb0aa 100644 --- a/libp2p/metrics/ping.py +++ b/libp2p/metrics/ping.py @@ -7,7 +7,7 @@ class PingMetrics: rtt: Histogram failures: Counter - def __init__(self): + def __init__(self) -> None: rtt = Histogram( "ping", "round-trip time sending a 'ping' and receiving a 'pong'", @@ -16,7 +16,7 @@ def __init__(self): failures = Counter( "ping_failure", - "FAilure while sending a ping or receiving a ping", + "Failure while sending a ping or receiving a ping", labelnames=["reason", "peer_id"], ) diff --git 
a/libp2p/metrics/swarm.py b/libp2p/metrics/swarm.py index fa868e206..1e5085162 100644 --- a/libp2p/metrics/swarm.py +++ b/libp2p/metrics/swarm.py @@ -28,7 +28,7 @@ class SwarmMetrics: dial_attempt: Counter dial_attempt_error: Counter - def __init__(self): + def __init__(self) -> None: self.conn_incoming = Counter( "swarm_incoming_conn", "Incoming connection received by libp2p-swarm", diff --git a/libp2p/network/connection/swarm_connection.py b/libp2p/network/connection/swarm_connection.py index 77e035e0b..d23b0a7e9 100644 --- a/libp2p/network/connection/swarm_connection.py +++ b/libp2p/network/connection/swarm_connection.py @@ -42,7 +42,7 @@ class SwarmConn(INetConn): _direction: Direction _actual_transport_addresses: list[Multiaddr] | None _connection_type: ConnectionType - _metric_send_channel: trio.MemorySendChannel | None = None + _metric_send_channel: trio.MemorySendChannel[Any] | None = None def __init__( self, diff --git a/libp2p/network/stream/net_stream.py b/libp2p/network/stream/net_stream.py index 52501b38e..433ac831c 100644 --- a/libp2p/network/stream/net_stream.py +++ b/libp2p/network/stream/net_stream.py @@ -5,6 +5,7 @@ import logging from typing import ( TYPE_CHECKING, + Any, ) import trio @@ -122,13 +123,13 @@ class NetStream(INetStream): muxed_stream: IMuxedStream protocol_id: TProtocol | None - metric_send_channel: trio.MemorySendChannel | None + metric_send_channel: trio.MemorySendChannel[Any] | None = None def __init__( self, muxed_stream: IMuxedStream, swarm_conn: "SwarmConn | None", - metric_send_channel: trio.MemorySendChannel | None, + metric_send_channel: trio.MemorySendChannel[Any] | None, ) -> None: self.muxed_stream = muxed_stream self.muxed_conn = muxed_stream.muxed_conn diff --git a/libp2p/network/swarm.py b/libp2p/network/swarm.py index 4aaccf5ca..d44ba845e 100644 --- a/libp2p/network/swarm.py +++ b/libp2p/network/swarm.py @@ -126,7 +126,7 @@ def __init__( retry_config: RetryConfig | None = None, connection_config: ConnectionConfig | 
QUICTransportConfig | None = None, psk: str | None = None, - metric_send_channel: trio.MemorySendChannel | None = None, + metric_send_channel: trio.MemorySendChannel[Any] | None = None, ): self.self_id = peer_id self.peerstore = peerstore @@ -493,7 +493,7 @@ async def dial_peer(self, peer_id: ID) -> list[INetConn]: """ # Emit metric-event for dial-attempt event = SwarmEvent() - event.peer_id = peer_id + event.peer_id = peer_id.pretty() event.dial_attempt = True if self.metric_send_channel is not None: @@ -558,7 +558,7 @@ async def dial_peer(self, peer_id: ID) -> list[INetConn]: # Emit metric-event for dial_attempt failure event = SwarmEvent() - event.peer_id = peer_id + event.peer_id = peer_id.pretty() event.dial_attempt_error = True if self.metric_send_channel is not None: diff --git a/libp2p/pubsub/pubsub.py b/libp2p/pubsub/pubsub.py index 0dd517eb9..3ef6781df 100644 --- a/libp2p/pubsub/pubsub.py +++ b/libp2p/pubsub/pubsub.py @@ -490,7 +490,7 @@ async def continuously_read_stream(self, stream: INetStream) -> None: continue event = GossipsubEvent() - event.peer_id = peer_id + event.peer_id = peer_id.pretty() event.message_size = len(incoming) if rpc_incoming.publish: diff --git a/newsfragments/1199.rst b/newsfragments/1199.rst new file mode 100644 index 000000000..69a6a7757 --- /dev/null +++ b/newsfragments/1199.rst @@ -0,0 +1 @@ +Added the metrics module to monitor internal service activities via Prometheus/Grafana dashboards. diff --git a/tests/core/network/test_net_stream_concurrency.py b/tests/core/network/test_net_stream_concurrency.py index dceae8b99..5ec09904f 100644 --- a/tests/core/network/test_net_stream_concurrency.py +++ b/tests/core/network/test_net_stream_concurrency.py @@ -5,11 +5,13 @@ access to stream state and prevent race conditions. 
""" +from typing import cast from unittest.mock import Mock import pytest import trio +from libp2p.abc import IMuxedStream from libp2p.network.stream.net_stream import NetStream, StreamState @@ -59,7 +61,11 @@ def reset(self): def mock_stream(): """Create a mock NetStream for testing.""" muxed_stream = MockMuxedStream() - stream = NetStream(muxed_stream=muxed_stream, swarm_conn=Mock()) # type: ignore[arg-type] + stream = NetStream( + muxed_stream=cast(IMuxedStream, muxed_stream), + swarm_conn=Mock(), + metric_send_channel=None, + ) # type: ignore[arg-type] return stream, muxed_stream diff --git a/tests/core/network/test_net_stream_error_state.py b/tests/core/network/test_net_stream_error_state.py index 398cdd2ec..8faf8b7a4 100644 --- a/tests/core/network/test_net_stream_error_state.py +++ b/tests/core/network/test_net_stream_error_state.py @@ -70,7 +70,7 @@ async def __aenter__(self) -> "IMuxedStream": def mock_stream(): """Create a mock stream for testing.""" muxed_stream = MockMuxedStream() - return NetStream(muxed_stream, None) + return NetStream(muxed_stream, None, None) @pytest.mark.trio @@ -190,7 +190,7 @@ async def test_is_operational_with_open_state(mock_stream): async def test_error_state_lifecycle(): """Test complete ERROR state lifecycle.""" muxed_stream = MockMuxedStream() - stream = NetStream(muxed_stream, None) + stream = NetStream(muxed_stream, None, None) # Start in INIT state assert await stream.state == StreamState.INIT diff --git a/tests/core/network/test_net_stream_state_transitions.py b/tests/core/network/test_net_stream_state_transitions.py index b43da2962..3aa1170c6 100644 --- a/tests/core/network/test_net_stream_state_transitions.py +++ b/tests/core/network/test_net_stream_state_transitions.py @@ -42,7 +42,7 @@ async def __aenter__(self) -> "IMuxedStream": def mock_stream(): """Create a mock stream for testing.""" muxed_stream = MockMuxedStream() - return NetStream(muxed_stream, None) + return NetStream(muxed_stream, None, None) 
@pytest.mark.trio From 3bfce786cd99217fba4548e935290097b1902013 Mon Sep 17 00:00:00 2001 From: Abhinav Agarwalla Date: Sun, 22 Mar 2026 21:43:00 +0530 Subject: [PATCH 15/16] added newsfragment file --- newsfragments/{1199.rst => 1199.feature.rst} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename newsfragments/{1199.rst => 1199.feature.rst} (100%) diff --git a/newsfragments/1199.rst b/newsfragments/1199.feature.rst similarity index 100% rename from newsfragments/1199.rst rename to newsfragments/1199.feature.rst From 66fd7d682974d67b855dc7efab619db1182fc88d Mon Sep 17 00:00:00 2001 From: Abhinav Agarwalla Date: Mon, 23 Mar 2026 10:33:17 +0530 Subject: [PATCH 16/16] chore: remove redundancies --- docs/examples.metrics.rst | 15 +++++++++++++++ examples/metrics/coordinator.py | 5 +++++ libp2p/metrics/kad_dht.py | 9 --------- libp2p/metrics/swarm.py | 7 ------- libp2p/pubsub/gossipsub.py | 1 + tests/core/network/test_stream_semaphore.py | 2 +- 6 files changed, 22 insertions(+), 17 deletions(-) diff --git a/docs/examples.metrics.rst b/docs/examples.metrics.rst index 6065a719e..b509406a9 100644 --- a/docs/examples.metrics.rst +++ b/docs/examples.metrics.rst @@ -65,6 +65,20 @@ start the prometheus and grafana dashboards. For this create a `docker-compose.y depends_on: - prometheus +And a `prometheus.yml` file like this: + +.. code-block:: console + + global: + scrape_interval: 5s + + scrape_configs: + - job_name: "libp2p-python" + static_configs: + - targets: + - "host.docker.internal:8000" + + And run it like this .. code-block:: console @@ -73,6 +87,7 @@ And run it like this A similar file is present in `py-libp2p/libp2p/metrics` directory also, so either create a new docker-compose file or run it from the above path. This basically starts a prometheus and grafana server in your localhost, with which the metrics can be viewed in graph format. +Remember that the dashboards will be created for the node, whose prometheus metric-server is running on port `8000`. 
Now see how to communicate between the 2 nodes, via Pubsub/Gossipsub, Ping and Kad-DHT diff --git a/examples/metrics/coordinator.py b/examples/metrics/coordinator.py index 9113e9051..48ac5c4e4 100644 --- a/examples/metrics/coordinator.py +++ b/examples/metrics/coordinator.py @@ -20,13 +20,18 @@ Available commands: - connect - Connect to another peer - ping - Ping to another peer + +GOSSIPSUB - join - Subscribe to a topic - leave - Unsubscribe to a topic - publish - Publish a message + +KAD-DHT - put - Execute PUT_VALUE in DHT - get - Execute GET_VALUE in DHT - advertize - Execute ADD_PROVIDER in DHT - get_provider - Execute GET_PROVIDERS in DHT + - local - List local multiaddr - help - List the existing commands - exit - Shut down diff --git a/libp2p/metrics/kad_dht.py b/libp2p/metrics/kad_dht.py index aae969d0d..7f8d1e3e2 100644 --- a/libp2p/metrics/kad_dht.py +++ b/libp2p/metrics/kad_dht.py @@ -2,15 +2,6 @@ from libp2p.kad_dht.kad_dht import KadDhtEvent -# COUNTER - -# INBOUND_REQ -# FIND_NODE -# GET_VALUE -# PUT_VALUE -# GET_PROVIDERS -# ADD_PROVIDERS - class KadDhtMetrics: inbound: Counter diff --git a/libp2p/metrics/swarm.py b/libp2p/metrics/swarm.py index 1e5085162..88e1b1573 100644 --- a/libp2p/metrics/swarm.py +++ b/libp2p/metrics/swarm.py @@ -1,12 +1,5 @@ from prometheus_client import Counter -# METRICS - -# conn_incoming -# conn_incoming_error -# dial_attemp -# conn_outgoing_error - class SwarmEvent: peer_id: str | None = None diff --git a/libp2p/pubsub/gossipsub.py b/libp2p/pubsub/gossipsub.py index 027275bc3..2f3bfeabc 100644 --- a/libp2p/pubsub/gossipsub.py +++ b/libp2p/pubsub/gossipsub.py @@ -78,6 +78,7 @@ _MAX_PENDING_GRAFT_PRUNE_PER_PEER = 64 + class GossipSub(IPubsubRouter, Service): protocols: list[TProtocol] pubsub: Pubsub | None diff --git a/tests/core/network/test_stream_semaphore.py b/tests/core/network/test_stream_semaphore.py index 1dde5ee62..8eb657dfe 100644 --- a/tests/core/network/test_stream_semaphore.py +++ 
b/tests/core/network/test_stream_semaphore.py @@ -44,7 +44,7 @@ def _mock_net_stream(swarm_conn: Mock | None = None) -> NetStream: muxed_stream.close = AsyncMock() muxed_stream.reset = AsyncMock() - ns = NetStream(muxed_stream, swarm_conn) + ns = NetStream(muxed_stream, swarm_conn, None) return ns