Commit 58e664a

Kill processes properly; Minimize subtensor calls

1 parent 605ce3a commit 58e664a

6 files changed: +63 -43 lines changed

neurons/validator.py

+26-12
@@ -1,4 +1,6 @@
 import asyncio
+import os
+import signal
 import sys
 from multiprocessing.managers import AcquirerProxy
 
@@ -36,7 +38,7 @@ async def create_loop_process(
     reward_events: list,
     miners_dict: dict,
     mp_lock: AcquirerProxy,
-) -> None:
+):
     # Load settings and initialize external services.
     settings.shared_settings = settings.SharedSettings.load(mode="validator")
     if settings.shared_settings.WANDB_ON:
@@ -45,8 +47,10 @@ async def create_loop_process(
     # A list to keep references to all the tasks we spawn, so they can be cancelled later.
     all_tasks: list[asyncio.Task] = []
 
-    async def cleanup():
+    async def cleanup(model_scheduler):
         logger.info("Cleaning up resources...")
+        torch.distributed.destroy_process_group()
+        await model_scheduler.llm_model.cleanup()
         for t in all_tasks:
             t.cancel()
         await asyncio.gather(*all_tasks, return_exceptions=True)
@@ -88,12 +92,12 @@ async def spawn_loops(task_queue: list, scoring_queue: list, reward_events: list
         await spawn_loops(task_queue, scoring_queue, reward_events, miners_dict)
     except MemoryError as e:
         logger.error(f"MemoryError encountered. Terminating program: {e}")
-        await cleanup()
+        await cleanup(model_scheduler)
         sys.exit(1)
     except Exception as e:
         logger.exception(f"Terminating loop process: {e}")
     finally:
-        await cleanup()
+        await cleanup(model_scheduler)
 
 
 def start_api(
@@ -260,10 +264,10 @@ async def main(
         step = 0
         while True:
             await asyncio.sleep(30)
-            block = settings.shared_settings.SUBTENSOR.get_current_block()
+            block = settings.shared_settings.block
             if (
                 block - settings.shared_settings.METAGRAPH.last_update[settings.shared_settings.UID] > 500
-                and step > 120
+                and step > 150
             ):
                 last_update_block = settings.shared_settings.METAGRAPH.last_update[settings.shared_settings.UID]
                 logger.warning(
@@ -279,17 +283,27 @@ async def main(
         logger.error(f"Main loop error: {e}")
         raise
     finally:
+        logger.warning("🚨 Force-killing entire process-group")
+
+        # 1. Cancel in-process tasks so they stop touching the Manager.
         for t in tasks:
             t.cancel()
-        await asyncio.gather(*tasks)
+        await asyncio.gather(*tasks, return_exceptions=True)
+
+        # 2. Manager cleanup *first* (so its socket vanishes).
+        manager.shutdown()
 
-        for process in processes:
-            if process.is_alive():
-                process.terminate()
-                process.join()
-        sys.exit(1)
+        # 3. Sledgehammer.
+        if os.name == "posix":
+            os.killpg(0, signal.SIGKILL)
+        else:
+            logger.error(f"Unsupported OS: {os.name}")
+            sys.exit(1)
 
 
 # The main function parses the configuration and runs the validator.
 if __name__ == "__main__":
+    if os.name == "posix":
+        # Become the leader of a new process group.
+        os.setpgrp()
     asyncio.run(main())
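The shutdown pattern above is worth spelling out: at startup the validator becomes the leader of a fresh process group (`os.setpgrp()`), so on shutdown a single `os.killpg(0, signal.SIGKILL)` reaps every child the multiprocessing layer spawned, including any that would have ignored `terminate()`. A minimal, self-contained sketch of the same idea; the worker body and sleep durations are illustrative, not from the commit:

import asyncio
import multiprocessing as mp
import os
import signal
import sys
import time


def worker() -> None:
    # Stand-in for a spawned loop process; it inherits the parent's process group.
    while True:
        time.sleep(1)


async def main() -> None:
    processes = [mp.Process(target=worker) for _ in range(2)]
    for p in processes:
        p.start()
    try:
        await asyncio.sleep(3)  # Stand-in for the real supervision loop.
    finally:
        if os.name == "posix":
            # SIGKILL the whole group: children and this process alike.
            os.killpg(0, signal.SIGKILL)
        sys.exit(1)  # Only reached on non-POSIX platforms.


if __name__ == "__main__":
    if os.name == "posix":
        os.setpgrp()  # Become the leader of a new process group.
    asyncio.run(main())

Note that `SIGKILL` to the group also kills the sending process itself, which is why the diff shuts the Manager down first and why its `sys.exit(1)` is only reachable on non-POSIX platforms.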

prompting/datasets/huggingface_github.py

+1-1
@@ -121,7 +121,7 @@ def next(self) -> HuggingFaceGithubDatasetEntry | None:
                 return self._process_entry(entry)
             except BaseException as e:
                 logger.debug(f"Failed to sample from shard, skipping: {e}")
-        raise ValueError(f"Failed to get sample from shard after {RETRIES} retries")
+        raise ValueError(f"Failed to get sample from shard after {RETRIES} retries.")
 
     def get(self) -> HuggingFaceGithubDatasetEntry:
         return self.next()

prompting/llms/model_manager.py

+10-6
@@ -1,4 +1,5 @@
 import asyncio
+from functools import partial
 import gc
 from multiprocessing.managers import AcquirerProxy
 from typing import ClassVar
@@ -81,19 +82,22 @@ async def load_model(self, model_config: ModelConfig, force: bool = True) -> Rep
             logger.debug(f"Unloading {active_model.llm_model_id} to make room for {model_config.llm_model_id}")
 
             await self._unload_model(active_model)
-            await self._vram_cleanup()
+            await self.cleanup()
 
         retries_max = 1
         retry_counter = 0
         retry_delay = 15
         while True:
             try:
                 GPUInfo.log_gpu_info()
-                model = model_factory(model_config.llm_model_id)(
+                # Wrap blocking model loading into thread.
+                loader = partial(
+                    model_factory(model_config.llm_model_id),
                     model_id=model_config.llm_model_id,
                     device=settings.shared_settings.NEURON_DEVICE,
                     sampling_params=settings.shared_settings.SAMPLING_PARAMS,
                 )
+                model: ReproducibleVLLM = await asyncio.to_thread(loader)
                 self.used_ram += model_config.min_ram
                 logger.info(
                     f"Model {model_config.llm_model_id} has been successfully loaded. "
@@ -105,13 +109,13 @@ async def load_model(self, model_config: ModelConfig, force: bool = True) -> Rep
             except BaseException as e:
                 if retry_counter > retries_max:
                     logger.error(f"Failed to load model after {retries_max} retries. Terminating process")
-                    await self._vram_cleanup()
+                    await self.cleanup()
                     # In case of VRAM leak, raise an exception to terminate the process.
                     raise MemoryError
 
                 retry_counter += 1
                 retry_delay += retry_counter
-                await self._vram_cleanup()
+                await self.cleanup()
                 logger.error(
                     f"Failed to load model {model_config.llm_model_id}. Retrying in {retry_delay} seconds. "
                     f"Error: {str(e)}"
@@ -150,7 +154,7 @@ async def _unload_model(self, model_config: ModelConfig):
         logger.debug(f"Initial free GPU memory before unloading: {initial_free_memory} GB")
 
         await self._cleanup_model(model_instance, cpu_offload=False)
-        await self._vram_cleanup()
+        await self.cleanup()
 
         memory_freed = GPUInfo.free_memory - initial_free_memory
         logger.info(f"Successfully unloaded model {model_config.llm_model_id}. Memory freed: {memory_freed:.2f} GB")
@@ -219,7 +223,7 @@ async def generate_logits(
             continue_last_message=continue_last_message,
         )
 
-    async def _vram_cleanup(self):
+    async def cleanup(self):
        """Perform VRAM clean-up."""
        for _, model in self.active_models.items():
            del model.model
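The `partial` + `asyncio.to_thread` change is the standard recipe for keeping an event loop responsive while a blocking call runs: bind the constructor arguments first, then hand the zero-argument callable to a worker thread. A runnable sketch under assumed names (`slow_load` and `heartbeat` stand in for vLLM weight loading and the validator's other coroutines):

import asyncio
import time
from functools import partial


def slow_load(model_id: str, device: str) -> str:
    # Stand-in for a blocking model constructor (e.g. multi-minute weight loading).
    time.sleep(2)
    return f"{model_id} on {device}"


async def heartbeat() -> None:
    for _ in range(4):
        print("event loop still alive")
        await asyncio.sleep(0.5)


async def main() -> None:
    # Bind arguments first, then push the blocking call onto a worker thread.
    loader = partial(slow_load, model_id="my-model", device="cuda")
    model, _ = await asyncio.gather(asyncio.to_thread(loader), heartbeat())
    print(model)


asyncio.run(main())

Without `to_thread`, the model load would stall every other coroutine in the loop process for its full duration.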

prompting/llms/vllm_llm.py

+3
@@ -209,6 +209,9 @@ def unload_model(self):
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
 
+    def __del__(self):
+        self.unload_model()
+
     @staticmethod
     def format_messages(messages: list[str] | list[dict[str, str]]) -> list[dict[str, str | list[dict[str, str]]]]:
         return messages
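Adding `__del__` ties GPU cleanup to object lifetime: when the last reference to a model wrapper is dropped (as in the manager's `del model.model`), `unload_model` runs automatically. A toy sketch of the pattern with a hypothetical class; note that CPython swallows exceptions raised inside finalizers, so the cleanup method should stay defensive:

class ModelWrapper:
    """Hypothetical stand-in for ReproducibleVLLM."""

    def __init__(self, model_id: str) -> None:
        self.model_id = model_id

    def unload_model(self) -> None:
        # The real method destroys vLLM state and empties the CUDA cache.
        print(f"unloading {self.model_id}")

    def __del__(self) -> None:
        # Finalizer: release resources even if a caller forgets to unload.
        self.unload_model()


wrapper = ModelWrapper("my-model")
del wrapper  # Refcount hits zero; CPython runs __del__ immediately.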

prompting/tasks/task_sending.py

+2-17
@@ -6,7 +6,6 @@
 
 from prompting.miner_availability.miner_availability import MinerAvailabilities
 
-# from prompting.rewards.scoring import task_scorer
 from prompting.rewards.scoring_config import ScoringConfig
 from prompting.tasks.base_task import BaseTextTask
 from prompting.tasks.inference import InferenceTask
@@ -77,9 +76,6 @@ async def collect_responses(task: BaseTextTask, miners_dict: dict) -> DendriteRe
 class TaskSender(AsyncLoopRunner):
     interval: int = 10
     _lock: asyncio.Lock = asyncio.Lock()
-    block_sync_last_time: float = 0
-    block_sync_interval: float = 300
-
     task_queue: list | None = None
     scoring_queue: list | None = None
     miners_dict: dict | None = None
@@ -93,17 +89,6 @@ async def start(self, task_queue, scoring_queue, miners_dict, **kwargs):
         self.miners_dict = miners_dict
         return await super().start(**kwargs)
 
-    @property
-    def block(self) -> int:
-        time_since_last_block = time.time() - self.block_sync_last_time
-        if time_since_last_block > self.block_sync_interval:
-            self._block = shared_settings.SUBTENSOR.get_current_block()
-            self.block_sync_last_time = time.time()
-            return self._block
-
-        blocks_passed = time_since_last_block // 12
-        return self._block + blocks_passed
-
     async def run_step(self) -> ValidatorLoggingEvent | ErrorLoggingEvent | None:
         logger.info("Checking for tasks to be sent...")
         while len(self.scoring_queue) > shared_settings.SCORING_QUEUE_LENGTH_THRESHOLD:
@@ -124,15 +109,15 @@ async def run_step(self) -> ValidatorLoggingEvent | ErrorLoggingEvent | None:
                 task=task,
                 response=response_event,
                 dataset_entry=task.dataset_entry,
-                block=self.block,
+                block=shared_settings.block,
                 step=self.step,
                 task_id=task.task_id,
             )
             self.scoring_queue.append(scoring_config)
 
         # Log the step event.
         return ValidatorLoggingEvent(
-            block=self.block,
+            block=shared_settings.block,
             step=self.step,
             step_time=timer.final_time,
             response_event=response_event,

shared/settings.py

+21-7
@@ -16,6 +16,7 @@
 import bittensor as bt
 import dotenv
 from bittensor.core.metagraph import Metagraph
+from bittensor.core.subtensor import Subtensor
 from loguru import logger
 from pydantic import Field, model_validator
 from pydantic_settings import BaseSettings
@@ -32,6 +33,9 @@ class SharedSettings(BaseSettings):
     _instance_mode: Optional[str] = None
     _last_metagraph: Metagraph = None
     _last_update_time: float = 0
+    _block_sync_last_time: float = 0
+    _block_sync_interval: float = 300
+    _subtensor: Subtensor | None = None
 
     mode: Literal["api", "validator", "miner", "mock"] = Field("validator", env="MODE")
     MOCK: bool = False
@@ -258,16 +262,20 @@ def WALLET(self):
         return bt.wallet(name=wallet_name, hotkey=hotkey)
 
     @cached_property
-    def SUBTENSOR(self) -> bt.subtensor:
+    def SUBTENSOR(self) -> Subtensor:
+        """Lazy subtensor initialization."""
+        if self._subtensor is not None:
+            return self._subtensor
         # TODO: Move chain-related stuff out of settings.
         subtensor_network = self.SUBTENSOR_NETWORK or os.environ.get("SUBTENSOR_NETWORK", "local")
         # bt_config = config()
         if subtensor_network.lower() == "local":
             subtensor_network = os.environ.get("SUBTENSOR_CHAIN_ENDPOINT")  # bt_config.subtensor.chain_endpoint or
         else:
-            subtensor_network = subtensor_network.lower()  # bt_config.subtensor.network or
+            subtensor_network = subtensor_network.lower()
         logger.info(f"Instantiating subtensor with network: {subtensor_network}")
-        return bt.subtensor(network=subtensor_network)
+        self._subtensor = Subtensor(network=subtensor_network)
+        return self._subtensor
 
     @property
     def METAGRAPH(self) -> Metagraph:
@@ -294,11 +302,17 @@ def UID(self) -> int:
         # TODO: Move chain-related stuff out of settings.
         return self.METAGRAPH.hotkeys.index(self.WALLET.hotkey.ss58_address)
 
-    @cached_property
-    def DENDRITE(self) -> bt.dendrite:
+    @property
+    def block(self) -> int:
         # TODO: Move chain-related stuff out of settings.
-        logger.info(f"Instantiating dendrite with wallet: {self.WALLET}")
-        return bt.dendrite(wallet=self.WALLET)
+        time_since_last_block = time.time() - self._block_sync_last_time
+        if time_since_last_block > self._block_sync_interval:
+            self._block = self.SUBTENSOR.get_current_block()
+            self._block_sync_last_time = time.time()
+            return self._block
+
+        blocks_passed = time_since_last_block // 12
+        return self._block + blocks_passed
 
 
 try:
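The relocated `block` property (moved here from `TaskSender`) is the heart of the "minimize subtensor calls" half of this commit: the chain is queried at most once per `_block_sync_interval` (300 s), and between syncs the height is extrapolated from Bittensor's nominal 12-second block time. A standalone sketch of the same caching logic, with illustrative names:

import time


class BlockEstimator:
    """Minimal sketch of the cached-block pattern (names are illustrative)."""

    SYNC_INTERVAL = 300.0  # Seconds between real chain queries.
    BLOCK_TIME = 12.0      # Nominal Bittensor block time in seconds.

    def __init__(self, get_current_block) -> None:
        self._get_current_block = get_current_block  # e.g. subtensor.get_current_block
        self._block = 0
        self._last_sync = 0.0

    @property
    def block(self) -> int:
        elapsed = time.time() - self._last_sync
        if elapsed > self.SYNC_INTERVAL:
            # Hit the chain at most once per interval.
            self._block = self._get_current_block()
            self._last_sync = time.time()
            return self._block
        # Otherwise extrapolate: roughly one new block every 12 seconds.
        return self._block + int(elapsed // self.BLOCK_TIME)


# Usage: estimator = BlockEstimator(subtensor.get_current_block); estimator.block

Combined with the cached `_subtensor` handle, this means `TaskSender` and the main loop no longer poll the chain independently; they share one connection and one rate-limited block counter.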
