- 
                Notifications
    
You must be signed in to change notification settings  - Fork 43
 
Open
Description
Overview
Add accelerator selection capabilities to the InferenceService API, allowing users to specify accelerator preferences and constraints for their model deployments.
Scope
- Extend InferenceServiceSpec
 - Extend Component Specs
 - Add to InferenceServiceStatus
 
Files to Modify
/pkg/apis/ome/v1beta1/inference_service.go
Implementation Details
// Add to InferenceServiceSpec struct (existing fields shown for context).
//
// InferenceServiceSpec gains exactly one new optional field,
// AcceleratorSelector; every other field listed here is pre-existing and is
// reproduced unchanged.
type InferenceServiceSpec struct {
	Predictor  PredictorSpec      `json:"predictor"` // Deprecated
	Engine     *EngineSpec        `json:"engine,omitempty"`
	Decoder    *DecoderSpec       `json:"decoder,omitempty"`
	Model      *ModelRef          `json:"model,omitempty"`
	Runtime    *ServingRuntimeRef `json:"runtime,omitempty"`
	Router     *RouterSpec        `json:"router,omitempty"`
	KedaConfig *KedaConfig        `json:"kedaConfig,omitempty"`

	// AcceleratorSelector holds the accelerator selection preferences for the
	// InferenceService. It is a pointer with omitempty, so an unset selector
	// is left out of the serialized object.
	// +optional
	AcceleratorSelector *AcceleratorSelector `json:"acceleratorSelector,omitempty"`
}
// AcceleratorSelector describes how an accelerator is chosen for an
// InferenceService: by an explicit class, by a set of constraints, and by a
// policy applied when more than one accelerator matches.
type AcceleratorSelector struct {
	// AcceleratorClass names one specific AcceleratorClass to use.
	// It takes precedence over the other selectors.
	// +optional
	AcceleratorClass *string `json:"acceleratorClass,omitempty"`

	// Constraints lists the requirements a candidate accelerator must meet.
	// +optional
	Constraints *AcceleratorConstraints `json:"constraints,omitempty"`

	// Policy is the selection policy used when multiple accelerators match.
	// +kubebuilder:validation:Enum=BestFit;Cheapest;MostCapable;FirstAvailable
	// +kubebuilder:default=BestFit
	// +optional
	Policy AcceleratorSelectionPolicy `json:"policy,omitempty"`
}
// NOTE(review): Kubernetes API conventions recommend int32/int64 over plain
// int and discourage floating-point fields in API types. Confirm that the
// *int and *float64 fields below are intended before generating the CRD.

// AcceleratorConstraints lists the requirements used to filter candidate
// accelerators. Every field is optional; an unset field is omitted from the
// serialized object.
type AcceleratorConstraints struct {
	// MinMemory is the minimum accelerator memory, in GB.
	// +optional
	MinMemory *int `json:"minMemory,omitempty"`

	// MaxMemory is the maximum accelerator memory, in GB (useful for cost
	// control).
	// +optional
	MaxMemory *int `json:"maxMemory,omitempty"`

	// MinComputeCapability is the minimum compute capability, in TFLOPS.
	// +optional
	MinComputeCapability *float64 `json:"minComputeCapability,omitempty"`

	// RequiredFeatures lists features that must be present.
	// +optional
	RequiredFeatures []string `json:"requiredFeatures,omitempty"`

	// ExcludedClasses lists AcceleratorClasses to avoid.
	// +optional
	ExcludedClasses []string `json:"excludedClasses,omitempty"`

	// ArchitectureFamilies restricts selection to specific families.
	// Examples: ["nvidia-hopper", "nvidia-ampere"]
	// +optional
	ArchitectureFamilies []string `json:"architectureFamilies,omitempty"`
}
// AcceleratorSelectionPolicy names the strategy used to pick one accelerator
// among several that match.
type AcceleratorSelectionPolicy string

// Supported AcceleratorSelectionPolicy values.
const (
	// BestFitPolicy selects the accelerator that best matches model
	// requirements.
	BestFitPolicy AcceleratorSelectionPolicy = "BestFit"

	// CheapestPolicy selects the lowest cost accelerator that meets
	// requirements.
	CheapestPolicy AcceleratorSelectionPolicy = "Cheapest"

	// MostCapablePolicy selects the most powerful accelerator available.
	MostCapablePolicy AcceleratorSelectionPolicy = "MostCapable"

	// FirstAvailablePolicy selects the first matching accelerator (fastest
	// scheduling).
	FirstAvailablePolicy AcceleratorSelectionPolicy = "FirstAvailable"
)
2. Extend Component Specs
Since OME uses engine/decoder/router architecture, each component might have different accelerator needs:
// Add to EngineSpec in inference_service.go (existing fields shown for context).
//
// EngineSpec gains one new optional field, AcceleratorOverride; the embedded
// specs and the Runner/Leader/Worker fields are pre-existing and reproduced
// unchanged.
type EngineSpec struct {
	PodSpec                `json:",inline"`
	ComponentExtensionSpec `json:",inline"`
	Runner                 *RunnerSpec `json:"runner,omitempty"`
	Leader                 *LeaderSpec `json:"leader,omitempty"`
	Worker                 *WorkerSpec `json:"worker,omitempty"`

	// AcceleratorOverride replaces the global accelerator selection for this
	// component when set.
	// +optional
	AcceleratorOverride *AcceleratorSelector `json:"acceleratorOverride,omitempty"`
}
// Add to DecoderSpec (existing fields shown for context).
//
// DecoderSpec gains one new optional field, AcceleratorOverride; the embedded
// specs and the Runner/Leader/Worker fields are pre-existing and reproduced
// unchanged.
type DecoderSpec struct {
	PodSpec                `json:",inline"`
	ComponentExtensionSpec `json:",inline"`
	Runner                 *RunnerSpec `json:"runner,omitempty"`
	Leader                 *LeaderSpec `json:"leader,omitempty"`
	Worker                 *WorkerSpec `json:"worker,omitempty"`

	// AcceleratorOverride replaces the global accelerator selection for this
	// component when set.
	// +optional
	AcceleratorOverride *AcceleratorSelector `json:"acceleratorOverride,omitempty"`
}
// Note: RouterSpec typically doesn't need a GPU, so no override is needed.
3. Add to InferenceServiceStatus
Update status to show which accelerator was selected:
// Add to ComponentStatusSpec in inference_service_status.go.
//
// ComponentStatusSpec gains one new optional field, SelectedAccelerator; the
// pre-existing fields are elided here.
type ComponentStatusSpec struct {
	// ... existing fields ...

	// SelectedAccelerator reports which AcceleratorClass was selected.
	// +optional
	SelectedAccelerator *AcceleratorSelection `json:"selectedAccelerator,omitempty"`
}
// AcceleratorSelection reports which accelerator was selected and why,
// together with the node selector and resource requests applied to pods.
type AcceleratorSelection struct {
    // AcceleratorClass is the AcceleratorClass that was selected.
    // It is the only field without omitempty, so it is always serialized.
    AcceleratorClass string `json:"acceleratorClass"`
    
    // Reason explains why this accelerator was selected.
    // +optional
    Reason string `json:"reason,omitempty"`
    
    // NodeSelector is the node selector that was applied to pods.
    // +optional
    NodeSelector map[string]string `json:"nodeSelector,omitempty"`
    
    // ResourceRequests holds the resource requests that were applied to pods.
    // +optional
    ResourceRequests map[string]string `json:"resourceRequests,omitempty"`
}
Acceptance Criteria
- Extend InferenceServiceSpec
 - Extend Component Specs
 - Update InferenceServiceStatus
 - API type tests pass
 
Dependencies
task1 - Create The CRD
Estimated Effort
2-3 hours
Metadata
Metadata
Assignees
Labels
No labels