Skip to content

Commit 2b28f60

Browse files
committed
chore: fix more Device spec
Signed-off-by: vividf <yihsiang.fang@tier4.jp>
1 parent dd46052 commit 2b28f60

5 files changed

Lines changed: 40 additions & 73 deletions

File tree

deployment/configs/schema.py

Lines changed: 27 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
PrecisionPolicy,
1818
)
1919
from deployment.core.backend import Backend
20+
from deployment.core.device import DeviceSpec
2021
from deployment.exporters.common.configs import TensorRTProfileConfig
2122

2223

@@ -72,8 +73,8 @@ def should_export_tensorrt(self) -> bool:
7273
class DeviceConfig:
7374
"""Normalized device settings shared across deployment stages."""
7475

75-
cpu: str = "cpu"
76-
cuda: Optional[str] = "cuda:0"
76+
cpu: DeviceSpec = field(default_factory=lambda: DeviceSpec.from_value("cpu"))
77+
cuda: Optional[DeviceSpec] = field(default_factory=lambda: DeviceSpec.from_value("cuda:0"))
7778

7879
def __post_init__(self) -> None:
7980
object.__setattr__(self, "cpu", self._normalize_cpu(self.cpu))
@@ -82,47 +83,32 @@ def __post_init__(self) -> None:
8283
@classmethod
8384
def from_dict(cls, config_dict: Mapping[str, Any]) -> DeviceConfig:
8485
"""Create DeviceConfig from dict."""
85-
return cls(cpu=config_dict.get("cpu", cls.cpu), cuda=config_dict.get("cuda", cls.cuda))
86+
return cls(cpu=config_dict.get("cpu", "cpu"), cuda=config_dict.get("cuda", "cuda:0"))
8687

8788
@staticmethod
88-
def _normalize_cpu(device: Optional[str]) -> str:
89-
"""Normalize CPU device string."""
90-
if not device:
91-
return "cpu"
92-
normalized = str(device).strip().lower()
93-
if normalized.startswith("cuda"):
89+
def _normalize_cpu(device: Any) -> DeviceSpec:
90+
"""Normalize CPU device."""
91+
normalized = DeviceSpec.from_value(device if device is not None else "cpu")
92+
if normalized.is_cuda:
9493
raise ValueError("CPU device cannot be a CUDA device")
9594
return normalized
9695

9796
@staticmethod
98-
def _normalize_cuda(device: Optional[str]) -> Optional[str]:
99-
"""Normalize CUDA device string to 'cuda:N' format."""
97+
def _normalize_cuda(device: Any) -> Optional[DeviceSpec]:
98+
"""Normalize CUDA device to DeviceSpec."""
10099
if device is None:
101100
return None
102-
if not isinstance(device, str):
103-
raise ValueError("cuda device must be a string (e.g., 'cuda:0')")
104-
normalized = device.strip().lower()
105-
if normalized == "":
106-
return None
107-
if normalized == "cuda":
108-
normalized = "cuda:0"
109-
if not normalized.startswith("cuda"):
110-
raise ValueError(f"Invalid CUDA device '{device}'. Must start with 'cuda'")
111-
suffix = normalized.split(":", 1)[1] if ":" in normalized else "0"
112-
suffix = suffix.strip() or "0"
113-
if not suffix.isdigit():
114-
raise ValueError(f"Invalid CUDA device index in '{device}'")
115-
device_id = int(suffix)
116-
if device_id < 0:
117-
raise ValueError("CUDA device index must be non-negative")
118-
return f"cuda:{device_id}"
101+
normalized = DeviceSpec.from_value(device)
102+
if not normalized.is_cuda:
103+
raise ValueError(f"Invalid CUDA device '{device}'.")
104+
return normalized
119105

120106
@property
121107
def cuda_device_index(self) -> Optional[int]:
122108
"""Return CUDA device index as integer (if configured)."""
123109
if self.cuda is None:
124110
return None
125-
return int(self.cuda.split(":", 1)[1])
111+
return self.cuda.index
126112

127113

128114
@dataclass(frozen=True)
@@ -360,7 +346,7 @@ class EvaluationConfig:
360346
verbose: bool = False
361347
backends: Mapping[Any, Mapping[str, Any]] = field(default_factory=_empty_mapping)
362348
models: Mapping[Any, Any] = field(default_factory=_empty_mapping)
363-
devices: Mapping[str, str] = field(default_factory=_empty_mapping)
349+
devices: Mapping[str, DeviceSpec] = field(default_factory=_empty_mapping)
364350

365351
@classmethod
366352
def from_dict(cls, config_dict: Mapping[str, Any]) -> EvaluationConfig:
@@ -383,13 +369,15 @@ def from_dict(cls, config_dict: Mapping[str, Any]) -> EvaluationConfig:
383369
if not isinstance(devices_raw, Mapping):
384370
raise TypeError(f"evaluation.devices must be a mapping, got {type(devices_raw).__name__}")
385371

372+
normalized_devices = {str(key): DeviceSpec.from_value(value) for key, value in devices_raw.items()}
373+
386374
return cls(
387375
enabled=config_dict.get("enabled", False),
388376
num_samples=config_dict.get("num_samples", 10),
389377
verbose=config_dict.get("verbose", False),
390378
backends=MappingProxyType(backends_frozen),
391379
models=MappingProxyType(dict(models_raw)),
392-
devices=MappingProxyType(dict(devices_raw)),
380+
devices=MappingProxyType(normalized_devices),
393381
)
394382

395383

@@ -398,9 +386,9 @@ class VerificationScenario:
398386
"""Immutable verification scenario specification."""
399387

400388
ref_backend: Backend
401-
ref_device: str
389+
ref_device: DeviceSpec
402390
test_backend: Backend
403-
test_device: str
391+
test_device: DeviceSpec
404392

405393
@classmethod
406394
def from_dict(cls, data: Mapping[str, Any]) -> VerificationScenario:
@@ -410,9 +398,9 @@ def from_dict(cls, data: Mapping[str, Any]) -> VerificationScenario:
410398

411399
return cls(
412400
ref_backend=Backend.from_value(data["ref_backend"]),
413-
ref_device=str(data["ref_device"]),
401+
ref_device=DeviceSpec.from_value(data["ref_device"]),
414402
test_backend=Backend.from_value(data["test_backend"]),
415-
test_device=str(data["test_device"]),
403+
test_device=DeviceSpec.from_value(data["test_device"]),
416404
)
417405

418406

@@ -423,7 +411,7 @@ class VerificationConfig:
423411
enabled: bool = True
424412
num_verify_samples: int = 3
425413
tolerance: float = 0.1
426-
devices: Mapping[str, str] = field(default_factory=_empty_mapping)
414+
devices: Mapping[str, DeviceSpec] = field(default_factory=_empty_mapping)
427415
scenarios: Mapping[ExportMode, Tuple[VerificationScenario, ...]] = field(default_factory=_empty_mapping)
428416

429417
@classmethod
@@ -452,11 +440,13 @@ def from_dict(cls, config_dict: Mapping[str, Any]) -> VerificationConfig:
452440
if not isinstance(devices_raw, Mapping):
453441
raise TypeError(f"verification.devices must be a mapping, got {type(devices_raw).__name__}")
454442

443+
normalized_devices = {str(key): DeviceSpec.from_value(value) for key, value in devices_raw.items()}
444+
455445
return cls(
456446
enabled=config_dict.get("enabled", True),
457447
num_verify_samples=config_dict.get("num_verify_samples", 3),
458448
tolerance=config_dict.get("tolerance", 0.1),
459-
devices=MappingProxyType(dict(devices_raw)),
449+
devices=MappingProxyType(normalized_devices),
460450
scenarios=MappingProxyType(scenario_map),
461451
)
462452

deployment/exporters/export_pipelines/base.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
from deployment.configs import BaseDeploymentConfig
1111
from deployment.core.artifacts import Artifact
12+
from deployment.core.device import DeviceSpec
1213
from deployment.core.io.base_data_loader import BaseDataLoader
1314

1415

@@ -54,7 +55,7 @@ def export(
5455
onnx_path: str,
5556
output_dir: str,
5657
config: BaseDeploymentConfig,
57-
device: str,
58+
device: DeviceSpec,
5859
) -> Artifact:
5960
"""
6061
Execute the TensorRT export pipeline and return the produced artifact.
@@ -63,7 +64,7 @@ def export(
6364
onnx_path: Path to ONNX model file/directory
6465
output_dir: Directory for output files
6566
config: Deployment configuration
66-
device: CUDA device string
67+
device: CUDA device specification
6768
6869
Returns:
6970
Artifact describing the exported TensorRT output

deployment/runtime/evaluation_orchestrator.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,7 @@ def _get_models_to_evaluate(self, artifact_manager: ArtifactManager) -> List[Mod
128128
if not backend_cfg.get("enabled", False):
129129
continue
130130

131-
raw_device = backend_cfg.get("device") or str(self._get_default_device(backend_enum))
131+
raw_device = backend_cfg.get("device") or self._get_default_device(backend_enum)
132132
device = DeviceSpec.from_value(raw_device)
133133
artifact, is_valid = artifact_manager.resolve_artifact(backend_enum)
134134

@@ -175,8 +175,10 @@ def _get_default_device(self, backend: Backend) -> DeviceSpec:
175175
Default device string
176176
"""
177177
if backend is Backend.TENSORRT:
178-
return DeviceSpec.from_value(self.config.devices.cuda or "cuda:0")
179-
return DeviceSpec.from_value(self.config.devices.cpu or "cpu")
178+
if self.config.devices.cuda is None:
179+
raise RuntimeError("TensorRT backend requires a configured CUDA device.")
180+
return self.config.devices.cuda
181+
return self.config.devices.cpu
180182

181183
def _print_cross_backend_comparison(self, all_results: Mapping[str, Any]) -> None:
182184
"""

deployment/runtime/export_orchestrator.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -400,9 +400,9 @@ def _export_tensorrt(self, onnx_path: str, context: ExportContext) -> Optional[A
400400
os.makedirs(tensorrt_dir, exist_ok=True)
401401

402402
cuda_device = self.config.devices.cuda
403-
device_id = self.config.devices.cuda_device_index
404-
if cuda_device is None or device_id is None:
403+
if cuda_device is None:
405404
raise RuntimeError("TensorRT export requires a CUDA device. Set deploy_cfg.devices['cuda'].")
405+
device_id = cuda_device.index
406406
torch.cuda.set_device(device_id)
407407
self.logger.info(f"Using CUDA device for TensorRT export: {cuda_device}")
408408

deployment/runtime/verification_orchestrator.py

Lines changed: 3 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,10 @@
77
from __future__ import annotations
88

99
import logging
10-
from typing import Any, Dict, Mapping
10+
from typing import Any, Dict
1111

1212
from deployment.configs import BaseDeploymentConfig
1313
from deployment.core.backend import Backend
14-
from deployment.core.device import DeviceSpec
1514
from deployment.core.evaluation.base_evaluator import BaseEvaluator
1615
from deployment.core.evaluation.evaluator_types import ModelSpec
1716
from deployment.core.io.base_data_loader import BaseDataLoader
@@ -85,16 +84,6 @@ def run(self, artifact_manager: ArtifactManager) -> Dict[str, Any]:
8584

8685
num_verify_samples = verification_cfg.num_verify_samples
8786
tolerance = verification_cfg.tolerance
88-
devices_raw = verification_cfg.devices
89-
if devices_raw is None:
90-
devices_raw = {}
91-
if not isinstance(devices_raw, Mapping):
92-
raise TypeError(f"verification.devices must be a mapping, got {type(devices_raw).__name__}")
93-
devices_map = dict(devices_raw)
94-
devices_map.setdefault("cpu", self.config.devices.cpu or "cpu")
95-
if self.config.devices.cuda:
96-
devices_map.setdefault("cuda", self.config.devices.cuda)
97-
9887
self.logger.info("=" * 80)
9988
self.logger.info(f"Running Verification (mode: {export_mode.value})")
10089
self.logger.info("=" * 80)
@@ -104,8 +93,8 @@ def run(self, artifact_manager: ArtifactManager) -> Dict[str, Any]:
10493
total_failed = 0
10594

10695
for i, policy in enumerate(scenarios):
107-
ref_device = self._resolve_device(policy.ref_device, devices_map)
108-
test_device = self._resolve_device(policy.test_device, devices_map)
96+
ref_device = policy.ref_device
97+
test_device = policy.test_device
10998

11099
self.logger.info(
111100
f"\nScenario {i+1}/{len(scenarios)}: "
@@ -164,18 +153,3 @@ def run(self, artifact_manager: ArtifactManager) -> Dict[str, Any]:
164153
}
165154

166155
return all_results
167-
168-
def _resolve_device(self, device_key: str, devices_map: Mapping[str, str]) -> DeviceSpec:
169-
"""
170-
Resolve a device key to a full device string.
171-
172-
Args:
173-
device_key: Device key to resolve
174-
devices_map: Mapping of device keys to full device strings
175-
Returns:
176-
Resolved device
177-
"""
178-
if device_key in devices_map:
179-
return DeviceSpec.from_value(devices_map[device_key])
180-
self.logger.warning(f"Device alias '{device_key}' not found in devices map, using as-is")
181-
return DeviceSpec.from_value(device_key)

0 commit comments

Comments (0)