diff --git a/inference/core/env.py b/inference/core/env.py
index ded1c02d7..542a37995 100644
--- a/inference/core/env.py
+++ b/inference/core/env.py
@@ -532,6 +532,9 @@
     warnings.simplefilter("ignore", ModelDependencyMissing)
 
 DISK_CACHE_CLEANUP = str2bool(os.getenv("DISK_CACHE_CLEANUP", "True"))
+MEMORY_FREE_THRESHOLD = float(
+    os.getenv("MEMORY_FREE_THRESHOLD", "0.0")
+)  # fraction (0.0-1.0) of free GPU memory, 0 disables memory pressure detection
 
 # Stream manager configuration
 try:
diff --git a/inference/core/managers/decorators/fixed_size_cache.py b/inference/core/managers/decorators/fixed_size_cache.py
index e2dc4eafc..21b91a48a 100644
--- a/inference/core/managers/decorators/fixed_size_cache.py
+++ b/inference/core/managers/decorators/fixed_size_cache.py
@@ -4,7 +4,7 @@
 from inference.core import logger
 from inference.core.entities.requests.inference import InferenceRequest
 from inference.core.entities.responses.inference import InferenceResponse
-from inference.core.env import DISK_CACHE_CLEANUP
+from inference.core.env import DISK_CACHE_CLEANUP, MEMORY_FREE_THRESHOLD
 from inference.core.managers.base import Model, ModelManager
 from inference.core.managers.decorators.base import ModelManagerDecorator
 from inference.core.managers.entities import ModelDescription
@@ -43,7 +43,13 @@ def add_model(
             return None
 
         logger.debug(f"Current capacity of ModelManager: {len(self)}/{self.max_size}")
-        while len(self) >= self.max_size:
+        # Evict in queue order while the cache is full or memory pressure is
+        # detected. Stop once the queue is empty: pressure may persist after
+        # every model is gone, and an empty queue cannot be popped.
+        while self._key_queue and (
+            len(self) >= self.max_size
+            or (MEMORY_FREE_THRESHOLD and self.memory_pressure_detected())
+        ):
             to_remove_model_id = self._key_queue.popleft()
             super().remove(
                 to_remove_model_id, delete_from_disk=DISK_CACHE_CLEANUP
@@ -141,3 +147,30 @@ def _resolve_queue_id(
         self, model_id: str, model_id_alias: Optional[str] = None
     ) -> str:
         return model_id if model_id_alias is None else model_id_alias
+
+    def memory_pressure_detected(self) -> bool:
+        """Report whether free CUDA memory has dropped below MEMORY_FREE_THRESHOLD.
+
+        Returns:
+            True when torch reports CUDA as available and the free/total memory
+            ratio is below MEMORY_FREE_THRESHOLD. False when CUDA is unavailable
+            or when the check fails for any reason (the failure is logged).
+        """
+        return_boolean = False
+        try:
+            import torch
+
+            if torch.cuda.is_available():
+                free_memory, total_memory = torch.cuda.mem_get_info()
+                return_boolean = free_memory / total_memory < MEMORY_FREE_THRESHOLD
+                logger.debug(
+                    f"Free memory: {free_memory}, Total memory: {total_memory}, threshold: {MEMORY_FREE_THRESHOLD}, return_boolean: {return_boolean}"
+                )
+            # TODO: Add memory calculation for other non-CUDA devices
+        except Exception as e:
+            # Best-effort check: a failure here (e.g. torch cannot be imported)
+            # is logged and the current value is returned instead of raising.
+            logger.error(
+                f"Failed to check CUDA memory pressure: {e}, returning {return_boolean}"
+            )
+        return return_boolean