Skip to content

Commit 2dd2ac3

Browse files
MadLittleMods and itsoyou
authored and committed
Re-introduce: Fix LaterGauge metrics to collect from all servers (#18791)
Re-introduces element-hq/synapse#18751, which was reverted in element-hq/synapse#18789 (which explains why the PR was reverted in the first place).
- Adds a `cleanup` pattern that cleans up metrics from each homeserver in the tests. Previously, the list of hooks built up until our CI machines couldn't operate properly; see element-hq/synapse#18789.
- Fixes a long-standing issue with the `synapse_background_update_status` metrics only tracking the last database listed in the config (see element-hq/synapse#18791 (comment)).
1 parent 2cd9462 commit 2dd2ac3

File tree

20 files changed

+435
-136
lines changed

20 files changed

+435
-136
lines changed

changelog.d/18791.misc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Fix `LaterGauge` metrics to collect from all servers.

synapse/_scripts/generate_workers_map.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -153,9 +153,13 @@ def get_registered_paths_for_default(
153153
"""
154154

155155
hs = MockHomeserver(base_config, worker_app)
156+
156157
# TODO We only do this to avoid an error, but don't need the database etc
157158
hs.setup()
158-
return get_registered_paths_for_hs(hs)
159+
registered_paths = get_registered_paths_for_hs(hs)
160+
hs.cleanup()
161+
162+
return registered_paths
159163

160164

161165
def elide_http_methods_if_unconflicting(

synapse/_scripts/synapse_port_db.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,7 @@
9999
from synapse.storage.prepare_database import prepare_database
100100
from synapse.types import ISynapseReactor
101101
from synapse.util import SYNAPSE_VERSION, Clock
102+
from synapse.util.stringutils import random_string
102103

103104
# Cast safety: Twisted does some naughty magic which replaces the
104105
# twisted.internet.reactor module with a Reactor instance at runtime.
@@ -323,13 +324,17 @@ def __init__(self, config: HomeServerConfig):
323324
self.config = config
324325
self.hostname = config.server.server_name
325326
self.version_string = SYNAPSE_VERSION
327+
self.instance_id = random_string(5)
326328

327329
def get_clock(self) -> Clock:
328330
return self.clock
329331

330332
def get_reactor(self) -> ISynapseReactor:
331333
return reactor
332334

335+
def get_instance_id(self) -> str:
336+
return self.instance_id
337+
333338
def get_instance_name(self) -> str:
334339
return "master"
335340

@@ -685,7 +690,15 @@ def build_db_store(
685690
)
686691
prepare_database(db_conn, engine, config=self.hs_config)
687692
# Type safety: ignore that we're using Mock homeservers here.
688-
store = Store(DatabasePool(hs, db_config, engine), db_conn, hs) # type: ignore[arg-type]
693+
store = Store(
694+
DatabasePool(
695+
hs, # type: ignore[arg-type]
696+
db_config,
697+
engine,
698+
),
699+
db_conn,
700+
hs, # type: ignore[arg-type]
701+
)
689702
db_conn.commit()
690703

691704
return store

synapse/federation/send_queue.py

Lines changed: 28 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
"""
3838

3939
import logging
40+
from enum import Enum
4041
from typing import (
4142
TYPE_CHECKING,
4243
Dict,
@@ -67,6 +68,25 @@
6768
logger = logging.getLogger(__name__)
6869

6970

71+
class QueueNames(str, Enum):
72+
PRESENCE_MAP = "presence_map"
73+
KEYED_EDU = "keyed_edu"
74+
KEYED_EDU_CHANGED = "keyed_edu_changed"
75+
EDUS = "edus"
76+
POS_TIME = "pos_time"
77+
PRESENCE_DESTINATIONS = "presence_destinations"
78+
79+
80+
queue_name_to_gauge_map: Dict[QueueNames, LaterGauge] = {}
81+
82+
for queue_name in QueueNames:
83+
queue_name_to_gauge_map[queue_name] = LaterGauge(
84+
name=f"synapse_federation_send_queue_{queue_name.value}_size",
85+
desc="",
86+
labelnames=[SERVER_NAME_LABEL],
87+
)
88+
89+
7090
class FederationRemoteSendQueue(AbstractFederationSender):
7191
"""A drop in replacement for FederationSender"""
7292

@@ -111,23 +131,16 @@ def __init__(self, hs: "HomeServer"):
111131
# we make a new function, so we need to make a new function so the inner
112132
# lambda binds to the queue rather than to the name of the queue which
113133
# changes. ARGH.
114-
def register(name: str, queue: Sized) -> None:
115-
LaterGauge(
116-
name="synapse_federation_send_queue_%s_size" % (queue_name,),
117-
desc="",
118-
labelnames=[SERVER_NAME_LABEL],
119-
caller=lambda: {(self.server_name,): len(queue)},
134+
def register(queue_name: QueueNames, queue: Sized) -> None:
135+
queue_name_to_gauge_map[queue_name].register_hook(
136+
homeserver_instance_id=hs.get_instance_id(),
137+
hook=lambda: {(self.server_name,): len(queue)},
120138
)
121139

122-
for queue_name in [
123-
"presence_map",
124-
"keyed_edu",
125-
"keyed_edu_changed",
126-
"edus",
127-
"pos_time",
128-
"presence_destinations",
129-
]:
130-
register(queue_name, getattr(self, queue_name))
140+
for queue_name in QueueNames:
141+
queue = getattr(self, queue_name.value)
142+
assert isinstance(queue, Sized)
143+
register(queue_name, queue=queue)
131144

132145
self.clock.looping_call(self._clear_queue, 30 * 1000)
133146

synapse/federation/sender/__init__.py

Lines changed: 27 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,24 @@
199199
labelnames=[SERVER_NAME_LABEL],
200200
)
201201

202+
transaction_queue_pending_destinations_gauge = LaterGauge(
203+
name="synapse_federation_transaction_queue_pending_destinations",
204+
desc="",
205+
labelnames=[SERVER_NAME_LABEL],
206+
)
207+
208+
transaction_queue_pending_pdus_gauge = LaterGauge(
209+
name="synapse_federation_transaction_queue_pending_pdus",
210+
desc="",
211+
labelnames=[SERVER_NAME_LABEL],
212+
)
213+
214+
transaction_queue_pending_edus_gauge = LaterGauge(
215+
name="synapse_federation_transaction_queue_pending_edus",
216+
desc="",
217+
labelnames=[SERVER_NAME_LABEL],
218+
)
219+
202220
# Time (in s) to wait before trying to wake up destinations that have
203221
# catch-up outstanding.
204222
# Please note that rate limiting still applies, so while the loop is
@@ -398,34 +416,27 @@ def __init__(self, hs: "HomeServer"):
398416
# map from destination to PerDestinationQueue
399417
self._per_destination_queues: Dict[str, PerDestinationQueue] = {}
400418

401-
LaterGauge(
402-
name="synapse_federation_transaction_queue_pending_destinations",
403-
desc="",
404-
labelnames=[SERVER_NAME_LABEL],
405-
caller=lambda: {
419+
transaction_queue_pending_destinations_gauge.register_hook(
420+
homeserver_instance_id=hs.get_instance_id(),
421+
hook=lambda: {
406422
(self.server_name,): sum(
407423
1
408424
for d in self._per_destination_queues.values()
409425
if d.transmission_loop_running
410426
)
411427
},
412428
)
413-
414-
LaterGauge(
415-
name="synapse_federation_transaction_queue_pending_pdus",
416-
desc="",
417-
labelnames=[SERVER_NAME_LABEL],
418-
caller=lambda: {
429+
transaction_queue_pending_pdus_gauge.register_hook(
430+
homeserver_instance_id=hs.get_instance_id(),
431+
hook=lambda: {
419432
(self.server_name,): sum(
420433
d.pending_pdu_count() for d in self._per_destination_queues.values()
421434
)
422435
},
423436
)
424-
LaterGauge(
425-
name="synapse_federation_transaction_queue_pending_edus",
426-
desc="",
427-
labelnames=[SERVER_NAME_LABEL],
428-
caller=lambda: {
437+
transaction_queue_pending_edus_gauge.register_hook(
438+
homeserver_instance_id=hs.get_instance_id(),
439+
hook=lambda: {
429440
(self.server_name,): sum(
430441
d.pending_edu_count() for d in self._per_destination_queues.values()
431442
)

synapse/handlers/presence.py

Lines changed: 18 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,18 @@
173173
labelnames=["locality", "from", "to", SERVER_NAME_LABEL],
174174
)
175175

176+
presence_user_to_current_state_size_gauge = LaterGauge(
177+
name="synapse_handlers_presence_user_to_current_state_size",
178+
desc="",
179+
labelnames=[SERVER_NAME_LABEL],
180+
)
181+
182+
presence_wheel_timer_size_gauge = LaterGauge(
183+
name="synapse_handlers_presence_wheel_timer_size",
184+
desc="",
185+
labelnames=[SERVER_NAME_LABEL],
186+
)
187+
176188
# If a user was last active in the last LAST_ACTIVE_GRANULARITY, consider them
177189
# "currently_active"
178190
LAST_ACTIVE_GRANULARITY = 60 * 1000
@@ -779,11 +791,9 @@ def __init__(self, hs: "HomeServer"):
779791
EduTypes.PRESENCE, self.incoming_presence
780792
)
781793

782-
LaterGauge(
783-
name="synapse_handlers_presence_user_to_current_state_size",
784-
desc="",
785-
labelnames=[SERVER_NAME_LABEL],
786-
caller=lambda: {(self.server_name,): len(self.user_to_current_state)},
794+
presence_user_to_current_state_size_gauge.register_hook(
795+
homeserver_instance_id=hs.get_instance_id(),
796+
hook=lambda: {(self.server_name,): len(self.user_to_current_state)},
787797
)
788798

789799
# The per-device presence state, maps user to devices to per-device presence state.
@@ -882,11 +892,9 @@ def __init__(self, hs: "HomeServer"):
882892
60 * 1000,
883893
)
884894

885-
LaterGauge(
886-
name="synapse_handlers_presence_wheel_timer_size",
887-
desc="",
888-
labelnames=[SERVER_NAME_LABEL],
889-
caller=lambda: {(self.server_name,): len(self.wheel_timer)},
895+
presence_wheel_timer_size_gauge.register_hook(
896+
homeserver_instance_id=hs.get_instance_id(),
897+
hook=lambda: {(self.server_name,): len(self.wheel_timer)},
890898
)
891899

892900
# Used to handle sending of presence to newly joined users/servers

synapse/http/request_metrics.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -164,11 +164,13 @@ def _get_in_flight_counts() -> Mapping[Tuple[str, ...], int]:
164164
return counts
165165

166166

167-
LaterGauge(
167+
in_flight_requests = LaterGauge(
168168
name="synapse_http_server_in_flight_requests_count",
169169
desc="",
170170
labelnames=["method", "servlet", SERVER_NAME_LABEL],
171-
caller=_get_in_flight_counts,
171+
)
172+
in_flight_requests.register_hook(
173+
homeserver_instance_id=None, hook=_get_in_flight_counts
172174
)
173175

174176

0 commit comments

Comments
 (0)