Skip to content
Draft
Show file tree
Hide file tree
Changes from 17 commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
265ffb2
Add Program Aware plugin initial implementation
praveingk Feb 25, 2026
0cedc28
Fix timestamp to use enqueue time
praveingk Feb 25, 2026
f1737e5
Add program aware plugin deployment
praveingk Feb 26, 2026
f83bd15
Add metrics to monitor programs
praveingk Feb 26, 2026
bdc356f
Add instructions to deploy and test
praveingk Feb 27, 2026
3e4cfee
Add EWMA for wait time
praveingk Feb 27, 2026
c58f125
Add scoring tests
praveingk Mar 2, 2026
ab4d182
Add minor tweaks to docs
praveingk Mar 2, 2026
ffabe70
Add pick latency to metrics
praveingk Mar 3, 2026
3894397
Add fairness test
praveingk Mar 3, 2026
5e8f3e1
Add optional vllm simulator tuning flags to deployment guide
D-Sai-Venkatesh Mar 5, 2026
42b042d
Remove ResponseReceived hook and add nil-request guard to ResponseCom…
D-Sai-Venkatesh Mar 6, 2026
87f5232
Remove test scripts
praveingk Mar 10, 2026
dc11a5f
Fix starvation for lighter programs
praveingk Mar 11, 2026
9ec35d0
Add DRR and modular strategies
praveingk Mar 11, 2026
6b202e2
Remove deployment-specific files
praveingk Mar 11, 2026
4c80b9d
Fix linter errors
praveingk Mar 11, 2026
a3e1bd4
Add fairnessIndex metric
praveingk Mar 17, 2026
606a5b7
Add dynamic normalization and tweak strategy
praveingk Mar 24, 2026
68bfa81
Modify fairness index calculation based on total wait time
praveingk Mar 24, 2026
f7d2665
Update ewma score to use tokens
praveingk Mar 25, 2026
635d400
Add EWMA wait time gauge and fix default-flow metrics
D-Sai-Venkatesh Mar 24, 2026
d04a68a
Merge branch 'program-aware-plugin-test' of github.com:praveingk/llm-…
praveingk Mar 25, 2026
14e4586
Change fairness index to throughput
praveingk Mar 25, 2026
ddefd82
Add throughput based scoring strategy
praveingk Mar 26, 2026
7076818
Add ewma of throughput
praveingk Mar 26, 2026
3b44141
Modify strategy from throughput to attained service
praveingk Mar 26, 2026
811b647
Add time-based decay
praveingk Mar 27, 2026
1993d34
Remove EWMA
praveingk Mar 27, 2026
2058888
Merge branch 'main' into program-aware-plugin
praveingk Apr 6, 2026
63b36ab
Fix broken imports and syntax errors
D-Sai-Venkatesh Apr 6, 2026
9392bfa
Add round-robin (RR) scheduling strategy with tests
D-Sai-Venkatesh Apr 7, 2026
01bf665
Simplify ScoringStrategy interface (8 methods to 3)
D-Sai-Venkatesh Apr 7, 2026
9b4ca5a
Change Pick input from slice to map[string]QueueInfo
D-Sai-Venkatesh Apr 7, 2026
edb5acf
Simplify RRStrategy Pick to cursor walk
D-Sai-Venkatesh Apr 7, 2026
cd8a492
Rename ServiceStrategy to LASStrategy and fix typos CI
D-Sai-Venkatesh Apr 8, 2026
6445ddf
Merge pull request #4 from praveingk/simplify-strategy-interface
praveingk Apr 9, 2026
9a17d36
Fix DRR double quantum allocation across Pick() calls
D-Sai-Venkatesh Apr 9, 2026
8e0fdf6
Change OnCompleted to accept request and response instead of raw tokens
D-Sai-Venkatesh Apr 9, 2026
0996e01
Add deferRRCursor option to defer RR cursor advance to OnPreRequest
D-Sai-Venkatesh Apr 9, 2026
64f15fe
Merge pull request #5 from praveingk/drr-prerequest-fix
praveingk Apr 10, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions deploy/config/sim-program-aware-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# EndpointPickerConfig for exercising the program-aware fairness plugin.
# NOTE(review): the nesting implied below assumes the usual indentation of this
# file; confirm against the checked-in YAML.
apiVersion: inference.networking.x-k8s.io/v1alpha1
kind: EndpointPickerConfig
# Plugins declared for this configuration, identified by their type name.
plugins:
- type: program-aware-fairness
- type: queue-scorer
- type: max-score-picker
- type: single-profile-handler

# Feature gates listed for this configuration.
featureGates:
- flowControl
- prepareDataPlugins

# Flow-control settings: the default priority band references the
# program-aware-fairness plugin as its fairness policy.
flowControl:
defaultPriorityBand:
fairnessPolicyRef: program-aware-fairness

# A single scheduling profile named "default" that references the
# queue-scorer and max-score-picker plugins.
schedulingProfiles:
- name: default
plugins:
- pluginRef: queue-scorer
- pluginRef: max-score-picker
10 changes: 7 additions & 3 deletions pkg/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ import (
"github.com/prometheus/client_golang/prometheus"
compbasemetrics "k8s.io/component-base/metrics"
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/metrics"

programaware "github.com/llm-d/llm-d-inference-scheduler/pkg/plugins/program-aware"
)

const (
Expand All @@ -31,9 +33,11 @@ var (

// GetCollectors returns all custom collectors for the llm-d-inference-scheduler.
//
// The result contains SchedulerPDDecisionCount followed by every collector
// reported by the program-aware plugin package. A fresh slice is built on each
// call so callers may append to it without affecting later calls.
func GetCollectors() []prometheus.Collector {
	extra := programaware.GetCollectors()

	// Pre-size for the single scheduler collector plus the plugin's collectors.
	collectors := make([]prometheus.Collector, 0, 1+len(extra))
	collectors = append(collectors, SchedulerPDDecisionCount)
	collectors = append(collectors, extra...)
	return collectors
}

// RecordPDDecision increments the counter for a specific P/D routing decision.
Expand Down
196 changes: 196 additions & 0 deletions pkg/plugins/program-aware/plugin.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,196 @@
// Package programaware implements a flow-control fairness policy that schedules
// programs based on their accumulated metrics, via scoring strategies (EWMA or DRR).
package programaware

import (
"context"
"encoding/json"
"fmt"
"math"
"sync"
"time"

"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/framework/interface/flowcontrol"
"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/framework/interface/plugin"
requestcontrol "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/framework/interface/requestcontrol"
)

const (
	// fairnessIDHeader names the request header that carries the program
	// identity (the standard fairness-ID header).
	fairnessIDHeader = "x-gateway-inference-fairness-id"

	// ProgramAwarePluginType is the type name under which this plugin is
	// registered and referenced from configuration.
	ProgramAwarePluginType = "program-aware-fairness"
)

// Config holds the JSON-decoded configuration for the plugin.
//
// It is populated by ProgramAwarePluginFactory, which starts from
// Strategy "ewma" and overlays whatever the raw JSON config provides.
type Config struct {
// Strategy selects the fairness scoring algorithm used by Pick().
// Documented values: "ewma" (default), "drr".
// NOTE(review): the authoritative set of accepted names is whatever
// newStrategy accepts; it is not visible here — confirm this list is current.
//
// "ewma" — head-of-queue age + EWMA historical wait + dispatch-count penalty.
// Practical heuristic; strong starvation prevention.
//
// "drr" — Deficit Round Robin adapted for tokens [Shreedhar & Varghese 1995].
// Each round every active queue earns a token quantum; actual token
// usage is deducted at response completion. Provides provably
// proportional fairness independent of request rate or size.
Strategy string `json:"strategy"`
}

// Compile-time checks that *ProgramAwarePlugin satisfies every framework
// interface it is meant to implement. Typed nil pointers are used so nothing
// is allocated just for the check.
var (
	_ flowcontrol.FairnessPolicy       = (*ProgramAwarePlugin)(nil)
	_ requestcontrol.PrepareDataPlugin = (*ProgramAwarePlugin)(nil)
	_ requestcontrol.PreRequest        = (*ProgramAwarePlugin)(nil)
	_ requestcontrol.ResponseComplete  = (*ProgramAwarePlugin)(nil)
)

// ProgramAwarePluginFactory builds a ProgramAwarePlugin from its raw JSON config.
//
// The configuration starts with Strategy "ewma"; when rawCfg is non-empty it is
// decoded on top of that default. Example config: {"strategy": "drr"}
//
// An error is returned when rawCfg cannot be decoded into Config or when
// newStrategy rejects the requested strategy name. The plugin.Handle argument
// is not used.
//
//nolint:revive
func ProgramAwarePluginFactory(name string, rawCfg json.RawMessage, _ plugin.Handle) (plugin.Plugin, error) {
	var cfg Config
	cfg.Strategy = "ewma"

	if len(rawCfg) != 0 {
		decodeErr := json.Unmarshal(rawCfg, &cfg)
		if decodeErr != nil {
			return nil, fmt.Errorf("invalid config for %s plugin %q: %w", ProgramAwarePluginType, name, decodeErr)
		}
	}

	scoring, strategyErr := newStrategy(cfg.Strategy)
	if strategyErr != nil {
		return nil, fmt.Errorf("%s plugin %q: %w", ProgramAwarePluginType, name, strategyErr)
	}

	created := &ProgramAwarePlugin{name: name, strategy: scoring}
	return created, nil
}

// ProgramAwarePlugin implements a FairnessPolicy that selects which program's
// queue to service next, and request lifecycle hooks that track per-program metrics.
//
// Fairness behaviour is determined by the configured ScoringStrategy (default: EWMA).
// Program identity comes from the x-gateway-inference-fairness-id request header.
//
//nolint:revive
type ProgramAwarePlugin struct {
// name is the instance name reported by TypedName.
name string
// strategy is the scoring strategy chosen at construction time. It may be nil
// on a zero-value plugin; getStrategy supplies an EWMA fallback in that case.
strategy ScoringStrategy

// programMetrics stores aggregated metrics per program.
// Key: program ID (string), Value: *ProgramMetrics.
programMetrics sync.Map

// requestTimestamps records, for the head request of the queue selected by
// Pick(), that request's enqueue time (not the dispatch time). It is intended
// for computing flow-control queue wait time in PreRequest.
// Key: request ID (string), Value: time.Time.
// NOTE(review): entries are added on every successful Pick(); confirm that the
// consumer removes them so the map does not grow without bound.
requestTimestamps sync.Map
}

// TypedName reports this plugin's identity: the registered plugin type
// together with the name given to this particular instance.
func (p *ProgramAwarePlugin) TypedName() plugin.TypedName {
	var identity plugin.TypedName
	identity.Type = ProgramAwarePluginType
	identity.Name = p.name
	return identity
}

// getStrategy yields the strategy the plugin was configured with. A plugin
// built as a bare zero value (as tests may do) has no strategy set, in which
// case a new EWMAStrategy is handed back instead.
func (p *ProgramAwarePlugin) getStrategy() ScoringStrategy {
	if configured := p.strategy; configured != nil {
		return configured
	}
	return &EWMAStrategy{}
}

// --- FairnessPolicy interface ---

// NewState is the per-PriorityBand state constructor required by the
// FairnessPolicy interface. The plugin keeps all of its state in its own
// sync.Map fields, so there is nothing to hand back per band and the result
// is always nil.
func (p *ProgramAwarePlugin) NewState(_ context.Context) any {
	var noBandState any
	return noBandState
}

// Pick selects which program queue to service next.
//
// For each queue in the band, the configured ScoringStrategy is given a chance
// to update its per-program state (OnPickStart), then the queue with the highest
// score is selected for dispatch.
func (p *ProgramAwarePlugin) Pick(_ context.Context, band flowcontrol.PriorityBandAccessor) (flowcontrol.FlowQueueAccessor, error) {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

question: while it's true you have no consumers of the ctx in this method. should we perform this work if ctx.Err()?

start := time.Now()
defer func() {
pickLatencyUs.Observe(float64(time.Since(start).Microseconds()))
}()

if band == nil {
return nil, nil //nolint:nilnil
}
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

question: nil band is not an error? this means the caller even when the err == nil must nil check the return is nil. also won't this give inaccurate latency metrics for work that was not done?

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We return (nil, nil) instead of (nil, err) because this is how the contract for fairness policy has been defined. The existing Fairness plugins also follow the same convention for example, round-robin.

From the behaviour point of view, both cases have same behaviour, that is no item will be dispatched from that band. But in error case the error will be logged.


var bestQueue flowcontrol.FlowQueueAccessor
bestScore := -1.0
strategy := p.getStrategy()

band.IterateQueues(func(queue flowcontrol.FlowQueueAccessor) (keepIterating bool) {
if queue == nil {
return true
}

queueLen := queue.Len()
metrics := p.getOrCreateMetrics(queue.FlowKey().ID)

// Strategy hook: runs for every queue, including empty ones.
// DRR: allocates quantum for active queues, resets deficit for idle queues.
// EWMA: no-op.
strategy.OnPickStart(queue.FlowKey().ID, queueLen, metrics)

if queueLen == 0 {
return true
}

score := p.scoreQueue(queue)
if score > bestScore {
bestScore = score
bestQueue = queue
}
return true
})

// Record the selected item's enqueue time so PreRequest can compute
// the actual flow-control queue wait time (enqueue → dispatch).
if bestQueue != nil {
if head := bestQueue.PeekHead(); head != nil {
p.requestTimestamps.Store(head.OriginalRequest().ID(), head.EnqueueTime())
}
}

return bestQueue, nil
}

// scoreQueue asks the active ScoringStrategy for the queue's score, handing it
// the program's stored metrics. When no metrics are recorded for the queue's
// program ID, a nil *ProgramMetrics is passed instead.
func (p *ProgramAwarePlugin) scoreQueue(queue flowcontrol.FlowQueueAccessor) float64 {
	var programStats *ProgramMetrics
	stored, found := p.programMetrics.Load(queue.FlowKey().ID)
	if found {
		programStats = stored.(*ProgramMetrics)
	}

	scorer := p.getStrategy()
	return scorer.ScoreQueue(queue, programStats)
}

// getOrCreateMetrics looks up the ProgramMetrics registered for programID and
// registers an empty one when none exists yet. The plain Load is tried first so
// the common hit path allocates nothing; on a miss, LoadOrStore decides which
// value wins if several callers race to create the entry.
func (p *ProgramAwarePlugin) getOrCreateMetrics(programID string) *ProgramMetrics {
	entry, found := p.programMetrics.Load(programID)
	if !found {
		entry, _ = p.programMetrics.LoadOrStore(programID, &ProgramMetrics{})
	}
	return entry.(*ProgramMetrics)
}

// normalize scales v by limit and clamps the result to [0, 1].
//
// It returns 0 when limit is not positive, and also when the ratio is not a
// number (NaN v, NaN limit, or Inf/Inf). math.Max and math.Min propagate NaN,
// so without that guard a NaN would escape the documented [0, 1] range and
// every ordered comparison against the result would be false.
func normalize(v, limit float64) float64 {
	if limit <= 0 {
		return 0
	}
	ratio := v / limit
	if math.IsNaN(ratio) {
		return 0
	}
	return math.Min(math.Max(ratio, 0), 1)
}
Loading
Loading