
Commit 1a14322

Merge pull request #1103 from roboflow/feat/evict-models-when-cuda-memory-free-below-threshold
Memory pressure safety valve
2 parents d87dbb8 + d0131bd commit 1a14322

2 files changed: +27 -2 lines changed


inference/core/env.py (+3 lines)
@@ -532,6 +532,9 @@
 warnings.simplefilter("ignore", ModelDependencyMissing)
 
 DISK_CACHE_CLEANUP = str2bool(os.getenv("DISK_CACHE_CLEANUP", "True"))
+MEMORY_FREE_THRESHOLD = float(
+    os.getenv("MEMORY_FREE_THRESHOLD", "0.0")
+)  # percentage of free memory, 0 disables memory pressure detection
 
 # Stream manager configuration
 try:

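For context, the new setting is parsed once when inference/core/env.py is imported, and the default of "0.0" leaves the check disabled. A minimal sketch of how a deployment could opt in, assuming an illustrative threshold of 0.10 (i.e. evict models once less than 10% of CUDA memory is free); the snippet below is not part of the commit:

import os

# Illustrative opt-in: treat "less than 10% of CUDA memory free" as memory pressure.
# The default "0.0" keeps the safety valve disabled.
os.environ["MEMORY_FREE_THRESHOLD"] = "0.10"

# Mirrors the parsing added to inference/core/env.py above.
MEMORY_FREE_THRESHOLD = float(os.getenv("MEMORY_FREE_THRESHOLD", "0.0"))
print(MEMORY_FREE_THRESHOLD)  # 0.1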
inference/core/managers/decorators/fixed_size_cache.py (+24 -2 lines)
@@ -4,7 +4,7 @@
 from inference.core import logger
 from inference.core.entities.requests.inference import InferenceRequest
 from inference.core.entities.responses.inference import InferenceResponse
-from inference.core.env import DISK_CACHE_CLEANUP
+from inference.core.env import DISK_CACHE_CLEANUP, MEMORY_FREE_THRESHOLD
 from inference.core.managers.base import Model, ModelManager
 from inference.core.managers.decorators.base import ModelManagerDecorator
 from inference.core.managers.entities import ModelDescription
@@ -43,7 +43,9 @@ def add_model(
             return None
 
         logger.debug(f"Current capacity of ModelManager: {len(self)}/{self.max_size}")
-        while len(self) >= self.max_size:
+        while len(self) >= self.max_size or (
+            MEMORY_FREE_THRESHOLD and self.memory_pressure_detected()
+        ):
             to_remove_model_id = self._key_queue.popleft()
             super().remove(
                 to_remove_model_id, delete_from_disk=DISK_CACHE_CLEANUP
@@ -141,3 +143,23 @@ def _resolve_queue_id(
         self, model_id: str, model_id_alias: Optional[str] = None
     ) -> str:
         return model_id if model_id_alias is None else model_id_alias
+
+    def memory_pressure_detected(self) -> bool:
+        return_boolean = False
+        try:
+            import torch
+
+            if torch.cuda.is_available():
+                free_memory, total_memory = torch.cuda.mem_get_info()
+                return_boolean = (
+                    float(free_memory / total_memory) < MEMORY_FREE_THRESHOLD
+                )
+                logger.debug(
+                    f"Free memory: {free_memory}, Total memory: {total_memory}, threshold: {MEMORY_FREE_THRESHOLD}, return_boolean: {return_boolean}"
+                )
+            # TODO: Add memory calculation for other non-CUDA devices
+        except Exception as e:
+            logger.error(
+                f"Failed to check CUDA memory pressure: {e}, returning {return_boolean}"
+            )
+        return return_boolean

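A minimal standalone sketch of the new check, assuming a PyTorch build with CUDA support is installed; the 0.10 threshold is illustrative and stands in for MEMORY_FREE_THRESHOLD:

import torch

def memory_pressure_detected(threshold: float = 0.10) -> bool:
    # torch.cuda.mem_get_info() returns (free_bytes, total_bytes) for the
    # current CUDA device; "pressure" means the free fraction fell below the threshold.
    if not torch.cuda.is_available():
        return False
    free_memory, total_memory = torch.cuda.mem_get_info()
    return (free_memory / total_memory) < threshold

if __name__ == "__main__":
    print(f"memory pressure detected: {memory_pressure_detected()}")

As wired into add_model above, the check short-circuits on MEMORY_FREE_THRESHOLD, so the default of 0.0 preserves the existing max_size-only eviction behaviour.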