@@ -1,10 +1,14 @@
 import asyncio
+import atexit
 import os
 import signal
 import sys
+import time
 from multiprocessing.managers import AcquirerProxy
+from multiprocessing.synchronize import Event

 import netaddr
+import psutil
 import requests
 import torch
 import torch.multiprocessing as mp
@@ -102,6 +106,7 @@ def start_api(
     scoring_queue: list,
     reward_events: list,
     miners_dict: dict,
+    event_stop: Event,
 ):
     from prompting.api.api import start_scoring_api  # noqa: F401

@@ -124,7 +129,7 @@ async def start():
             logger.warning(f"Failed to serve scoring api to chain: {e}")
         await start_scoring_api(task_scorer, scoring_queue, reward_events, miners_dict)

-        while True:
+        while not event_stop.is_set():
             await asyncio.sleep(10)

     asyncio.run(start())
@@ -134,6 +139,7 @@ def start_task_sending_loop(
     task_queue: list,
     scoring_queue: list,
     miners_dict: dict,
+    event_stop: Event,
 ):
     async def spawn_loops(task_queue, scoring_queue, miners_dict: dict):
         from prompting.tasks.task_sending import TaskSender
@@ -142,7 +148,8 @@ async def spawn_loops(task_queue, scoring_queue, miners_dict: dict):
         task_sender = TaskSender()
         asyncio.create_task(task_sender.start(task_queue, scoring_queue, miners_dict, simultaneous_loops=1))
         logger.debug("Task sending loop started")
-        while True:
+
+        while not event_stop.is_set():
             await asyncio.sleep(5)
             logger.debug("Task sending loop is running")

@@ -155,13 +162,13 @@ async def spawn_loops(task_queue, scoring_queue, miners_dict: dict):
         raise


-def start_availability_checking_loop(miners_dict: dict):
+def start_availability_checking_loop(miners_dict: dict, event_stop: Event):
     async def spawn_loops(miners_dict: dict):
         from prompting.miner_availability.miner_availability import availability_checking_loop

         logger.info("Starting availability checking loop in validator...")
         asyncio.create_task(availability_checking_loop.start(miners_dict))
-        while True:
+        while not event_stop.is_set():
             await asyncio.sleep(5)
             logger.debug("Availability checking loop is running")

@@ -174,13 +181,13 @@ async def spawn_loops(miners_dict: dict):
         raise


-def start_weight_setter_loop(reward_events):
+def start_weight_setter_loop(reward_events, event_stop: Event):
     async def spawn_loops(reward_events):
         from prompting.weight_setting.weight_setter import weight_setter

         logger.info("Starting weight setter loop in validator...")
         asyncio.create_task(weight_setter.start(reward_events))
-        while True:
+        while not event_stop.is_set():
             await asyncio.sleep(5)
             logger.debug("Weight setter loop is running")

@@ -193,6 +200,34 @@ async def spawn_loops(reward_events):
         raise


+def health_check(parent_pid: int, event_stop: Event):
+    """Monitor the parent process and kill all child processes in case of emergency."""
+    step = 0
+    while True:
+        try:
+            if not psutil.pid_exists(parent_pid):
+                event_stop.set()
+                logger.warning("Parent process died, killing all child processes")
+                os.killpg(0, signal.SIGKILL)
+
+            block = settings.shared_settings.block
+            if block - settings.shared_settings.METAGRAPH.last_update[settings.shared_settings.UID] > 320 and step > 60:
+                event_stop.set()
+                last_update_block = settings.shared_settings.METAGRAPH.last_update[settings.shared_settings.UID]
+                logger.warning(
+                    f"Metagraph hasn't been updated for {block - last_update_block} blocks. "
+                    f"Stale block: {block}, last update: {last_update_block}"
+                )
+                os.killpg(0, signal.SIGKILL)
+            step += 1
+
+        except Exception as e:
+            logger.error(f"Failed to kill process group: {e}")
+            sys.exit(1)
+
+        time.sleep(60)
+
+
 async def main(
     cache_rewards: list | None = None,
     cache_scores: list | None = None,
@@ -208,6 +243,7 @@ async def main(
     mp_lock = manager.Lock()
     processes: list[mp.Process] = []
     tasks: list[asyncio.Task] = []
+    event_stop = mp.Event()

     model_scheduler = AsyncModelScheduler(llm_model_manager=ModelManager(), mp_lock=mp_lock, sync=True)

@@ -216,15 +252,19 @@ async def main(
         if settings.shared_settings.DEPLOY_SCORING_API and not settings.shared_settings.NEURON_DISABLE_SET_WEIGHTS:
             # Use multiprocessing to bypass API blocking issue
             api_process = mp.Process(
-                target=start_api, args=(scoring_queue, reward_events, miners_dict), name="APIProcess"
+                target=start_api,
+                args=(scoring_queue, reward_events, miners_dict, event_stop),
+                name="APIProcess",
+                daemon=True,
             )
             api_process.start()
             processes.append(api_process)

         availability_process = mp.Process(
             target=start_availability_checking_loop,
-            args=(miners_dict,),
+            args=(miners_dict, event_stop),
             name="AvailabilityProcess",
+            daemon=True,
         )
         availability_process.start()
         processes.append(availability_process)
@@ -243,62 +283,73 @@ async def main(

         sending_task = mp.Process(
             target=start_task_sending_loop,
-            args=(task_queue, scoring_queue, miners_dict),
+            args=(task_queue, scoring_queue, miners_dict, event_stop),
             name="SendingTaskProcess",
+            daemon=True,
         )
         sending_task.start()
         processes.append(sending_task)

         weight_setter_process = mp.Process(
             target=start_weight_setter_loop,
-            args=(reward_events,),
+            args=(reward_events, event_stop),
             name="WeightSetterProcess",
+            daemon=True,
         )
         weight_setter_process.start()
         processes.append(weight_setter_process)

-        GPUInfo.log_gpu_info()
+        health_check_process = mp.Process(
+            target=health_check,
+            args=(os.getpid(), event_stop),
+            name="HealthCheckProcess",
+            daemon=True,
+        )
+        health_check_process.start()
+        processes.append(health_check_process)

-        step = 0
+        GPUInfo.log_gpu_info()
         while True:
             await asyncio.sleep(30)
-            block = settings.shared_settings.block
-            if (
-                block - settings.shared_settings.METAGRAPH.last_update[settings.shared_settings.UID] > 500
-                and step > 150
-            ):
-                last_update_block = settings.shared_settings.METAGRAPH.last_update[settings.shared_settings.UID]
-                logger.warning(
-                    f"Metagraph hasn't been updated for {block - last_update_block} blocks. "
-                    f"Staled block: {block}, Last update: {last_update_block}"
-                )
-                break
-            step += 1

     except KeyboardInterrupt:
+        event_stop.set()
         logger.info("KeyboardInterrupt detected. Shutting down gracefully...")
     except Exception as e:
         logger.error(f"Main loop error: {e}")
         raise
     finally:
-        logger.warning("🚨 Force‑killing entire process‑group")
+        logger.warning("🚨 Force‑killing entire process‑group")

         # 1. Cancel in‑process tasks so they stop touching the Manager.
         for t in tasks:
             t.cancel()
         await asyncio.gather(*tasks, return_exceptions=True)
+        await asyncio.sleep(5)

         # 2. Manager cleanup *first* (so its socket vanishes).
         manager.shutdown()

         # 3. Sledgehammer.
-        if os.name == "posix":
+        try:
             os.killpg(0, signal.SIGKILL)
-        else:
-            logger.error(f"Unsupported OS: {os.name}")
+        except Exception as e:
+            logger.error(f"Failed to kill process group: {e}")
         sys.exit(1)


+def kill_process_group():
+    try:
+        os.killpg(os.getpgid(0), signal.SIGKILL)
+    except Exception as e:
+        logger.error(f"Failed to kill process group: {e}")
+
+
 # The main function parses the configuration and runs the validator.
 if __name__ == "__main__":
+    try:
+        os.setpgrp()
+        atexit.register(kill_process_group)
+    except BaseException:
+        logger.warning("Failed to set process group; emergency termination may not work.")
     asyncio.run(main())
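A minimal standalone sketch of the shutdown pattern this change applies: a shared multiprocessing `Event` that worker loops poll for cooperative exit, a `psutil`-based watchdog that hard-kills the process group if the parent dies, and a process-group kill registered with `atexit` as a last resort. It uses the standard library's `multiprocessing` rather than `torch.multiprocessing`, and the `worker`, `watchdog`, and sleep intervals here are illustrative only, not part of the validator code.

```python
import atexit
import multiprocessing as mp
import os
import signal
import time
from multiprocessing.synchronize import Event

import psutil


def worker(event_stop: Event) -> None:
    # Cooperative shutdown: the loop exits once the shared event is set.
    while not event_stop.is_set():
        time.sleep(1)


def watchdog(parent_pid: int, event_stop: Event) -> None:
    # Emergency path: if the parent disappears, flag shutdown and
    # SIGKILL the whole process group so no orphans are left behind.
    while not event_stop.is_set():
        if not psutil.pid_exists(parent_pid):
            event_stop.set()
            os.killpg(0, signal.SIGKILL)
        time.sleep(5)


def kill_process_group() -> None:
    # Last-resort cleanup registered in the parent; note it also kills
    # the parent itself, mirroring the hard-kill behaviour above.
    try:
        os.killpg(os.getpgid(0), signal.SIGKILL)
    except Exception:
        pass


if __name__ == "__main__":
    os.setpgrp()                          # POSIX only: children join this process group
    atexit.register(kill_process_group)   # sledgehammer if anything lingers

    event_stop = mp.Event()
    procs = [
        mp.Process(target=worker, args=(event_stop,), daemon=True),
        mp.Process(target=watchdog, args=(os.getpid(), event_stop), daemon=True),
    ]
    for p in procs:
        p.start()

    try:
        time.sleep(10)                    # stands in for the validator's main loop
    finally:
        event_stop.set()                  # ask children to stop cooperatively
        for p in procs:
            p.join(timeout=5)
```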