1818"""
1919
2020import importlib .metadata
21+ import json
2122import logging
22- from typing import Dict
23+ import os
24+ from typing import Dict , Optional
2325
2426import pynvml
27+ from kubernetes import client , config
2528
2629logger = logging .getLogger (__name__ )
2730
2831
2932# VLLM process manager
3033class GpuTranslator :
31- def __init__ (self , mock_gpus : bool = False , mock_gpu_count : int = 8 ):
34+ def __init__ (self , mock_gpus : bool = False , mock_gpu_count : int = 8 , node_name : Optional [ str ] = None ):
3235 """
3336 Initialize GPU Translator
3437
3538 Args:
3639 mock_gpus: If True, skip pynvml and use mock mode for testing
3740 mock_gpu_count: Number of mock GPUs to create (default: 8)
41+ node_name: Kubernetes node name for ConfigMap-based GPU discovery
3842 """
3943 self .mapping = {}
4044 self .reverse_mapping = {}
4145 self .device_count = 0
4246 self .mock_mode = mock_gpus
4347 self .mock_gpu_count = mock_gpu_count
48+ self .node_name = node_name or os .getenv ("NODE_NAME" )
4449 if not self .mock_mode :
4550 self ._check_library ()
4651 self ._populate_mapping ()
@@ -62,11 +67,79 @@ def _check_library(self):
6267 f"package { package_name } not found. Please install it."
6368 )
6469
def _load_gpu_map_from_configmap(self) -> Optional[Dict[str, int]]:
    """
    Load this node's GPU mapping from the Kubernetes ConfigMap 'gpu-map'.

    The ConfigMap is read from the namespace named by the NAMESPACE
    environment variable ("default" when unset). The entry keyed by
    ``self.node_name`` must be a JSON object mapping GPU UUID to GPU index.

    Returns:
        Dict[str, int]: GPU UUID to index mapping, or None if no node name
        is set, the ConfigMap or the node entry is unavailable, or the
        entry is not a well-formed UUID-to-index JSON object.
    """
    if not self.node_name:
        logger.info("No node name provided, skipping ConfigMap GPU discovery")
        return None

    try:
        # Try to load in-cluster config first, fall back to kubeconfig
        try:
            config.load_incluster_config()
        except config.ConfigException:
            config.load_kube_config()

        v1 = client.CoreV1Api()

        # Read the ConfigMap
        namespace = os.getenv("NAMESPACE", "default")
        cm = v1.read_namespaced_config_map(name="gpu-map", namespace=namespace)

        if not cm.data or self.node_name not in cm.data:
            logger.warning(
                "Node '%s' not found in ConfigMap 'gpu-map' in namespace '%s'",
                self.node_name,
                namespace,
            )
            return None

        # Parse the JSON mapping for this node
        raw_mapping = json.loads(cm.data[self.node_name])

        # The caller iterates the result with .items() outside any try block,
        # so anything other than a JSON object must be rejected here.
        if not isinstance(raw_mapping, dict):
            logger.warning(
                "GPU mapping for node '%s' in ConfigMap 'gpu-map' is not a JSON object (got %s)",
                self.node_name,
                type(raw_mapping).__name__,
            )
            return None

        # Normalise indices to int so the mapping matches the declared
        # Dict[str, int] type; string indices such as "0" are accepted.
        gpu_mapping: Dict[str, int] = {}
        for gpu_uuid, gpu_index in raw_mapping.items():
            if isinstance(gpu_index, bool) or not isinstance(gpu_index, (int, str)):
                logger.warning(
                    "Invalid GPU index %r for GPU '%s' on node '%s' in ConfigMap 'gpu-map'",
                    gpu_index,
                    gpu_uuid,
                    self.node_name,
                )
                return None
            # A non-numeric string raises ValueError, handled by the outer except.
            gpu_mapping[gpu_uuid] = int(gpu_index)

        logger.info(
            "Loaded GPU mapping from ConfigMap for node '%s': %s",
            self.node_name,
            gpu_mapping,
        )
        return gpu_mapping

    except Exception as e:
        # Deliberately broad: ConfigMap discovery is best-effort and any
        # failure (API error, missing kubeconfig, bad JSON) must not abort
        # initialisation.
        logger.warning(
            "Failed to load GPU mapping from ConfigMap: %s. Falling back to mock mode.",
            e,
        )
        return None
119+
65120 def _populate_mapping (self ):
66121 """
67122 Creates mapping and reverse_mapping for the GPU Translator.
68- In mock mode, pre-populates with mock GPU UUIDs following the pattern GPU-{index}.
123+ Priority order:
124+ 1. ConfigMap 'gpu-map' (if in Kubernetes and node_name available)
125+ 2. Mock mode (if mock_gpus=True)
126+ 3. Real GPUs via pynvml
69127 """
128+ # Try ConfigMap first if in mock mode and node_name is available
129+ if self .mock_mode and self .node_name :
130+ configmap_mapping = self ._load_gpu_map_from_configmap ()
131+ if configmap_mapping :
132+ self .mapping = configmap_mapping
133+ self .reverse_mapping = {v : k for k , v in self .mapping .items ()}
134+ self .device_count = len (self .mapping )
135+ logger .info (
136+ "GPU Translator initialized from ConfigMap with %d GPUs for node '%s'" ,
137+ self .device_count ,
138+ self .node_name
139+ )
140+ return
141+
142+ # Fall back to hardcoded mock mode
70143 if self .mock_mode :
71144 # Pre-populate with mock GPUs following the test pattern: GPU-0, GPU-1, etc.
72145 for index in range (self .mock_gpu_count ):
@@ -77,6 +150,7 @@ def _populate_mapping(self):
77150 logger .info ("GPU Translator initialized in mock mode with %d mock GPUs" , self .mock_gpu_count )
78151 return
79152
153+ # Use real GPUs via pynvml
80154 try :
81155 pynvml .nvmlInit ()
82156 self .device_count = pynvml .nvmlDeviceGetCount ()
0 commit comments