opendatahub-io
diff --git a/‎.github/ISSUE_TEMPLATE/new-release.md‎
Lines changed: 12 additions & 12 deletions b/‎.github/ISSUE_TEMPLATE/new-release.md‎
Lines changed: 12 additions & 12 deletions
diff --git a/‎.github/actions/docker-build-and-push/action.yml‎
Lines changed: 1 addition & 0 deletions b/‎.github/actions/docker-build-and-push/action.yml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎.tekton/llm-d-inference-scheduler-pull-request.yaml‎
Lines changed: 0 additions & 52 deletions b/‎.tekton/llm-d-inference-scheduler-pull-request.yaml‎
Lines changed: 0 additions & 52 deletions
diff --git a/‎.tekton/llm-d-inference-scheduler-push.yaml‎
Lines changed: 0 additions & 46 deletions b/‎.tekton/llm-d-inference-scheduler-push.yaml‎
Lines changed: 0 additions & 46 deletions
diff --git a/‎AGENTS.md‎
Lines changed: 29 additions & 0 deletions b/‎AGENTS.md‎
Lines changed: 29 additions & 0 deletions
diff --git a/‎DEVELOPMENT.md‎
Lines changed: 88 additions & 2 deletions b/‎DEVELOPMENT.md‎
Lines changed: 88 additions & 2 deletions
diff --git a/‎cmd/epp/runner/runner.go‎
Lines changed: 7 additions & 1 deletion b/‎cmd/epp/runner/runner.go‎
Lines changed: 7 additions & 1 deletion
@@ -1,8 +1,8 @@
 ---
 name: New Release
 about: Propose a new release
-title: Release v0.x.0
-labels: ''
+title: Release vX.Y.Z
+labels: kind/release
 assignees: ''
 
 ---
@@ -49,7 +49,7 @@ This document defines the process for releasing llm-d-router.
 
 ### Create or Checkout branch 
 
-1. If you already have the repo cloned, ensure it’s up-to-date and your local branch is clean.
+1. If you already have the repo cloned, ensure it's up-to-date and your local branch is clean.
 
 1. Release Branch Handling:
    - For a Release Candidate:
@@ -63,7 +63,7 @@ This document defines the process for releasing llm-d-router.
      A release branch should already exist. In this case, check out the existing branch:
 
      ```shell
-     git checkout -b release-${MAJOR}.${MINOR} ${REMOTE}/release-${MAJOR}.${MINOR}
+     git checkout --track ${REMOTE}/release-${MAJOR}.${MINOR}
      ```
 
 1. Push your release branch to the llm-d-router remote.
@@ -79,13 +79,13 @@ This document defines the process for releasing llm-d-router.
    For a release candidate:
 
     ```shell
-    git tag -s -a v${MAJOR}.${MINOR}.${PATCH}-rc.${RC} -m 'llm-d-router v${MAJOR}.${MINOR}.${PATCH}-rc.${RC} Release Candidate'
+    git tag -s -a v${MAJOR}.${MINOR}.${PATCH}-rc.${RC} -m "llm-d-router v${MAJOR}.${MINOR}.${PATCH}-rc.${RC} Release Candidate"
     ```
 
    For a major, minor or patch release:
 
     ```shell
-    git tag -s -a v${MAJOR}.${MINOR}.${PATCH} -m 'llm-d-router v${MAJOR}.${MINOR}.${PATCH} Release'
+    git tag -s -a v${MAJOR}.${MINOR}.${PATCH} -m "llm-d-router v${MAJOR}.${MINOR}.${PATCH} Release"
     ```
 
 1. Push the tag to the llm-d-router repo.
@@ -102,16 +102,17 @@ This document defines the process for releasing llm-d-router.
     git push ${REMOTE} v${MAJOR}.${MINOR}.${PATCH}
     ```
 
-1. Pushing the tag triggers CI action to build and publish the [EPP image] and [sidecar image] to the [ghcr registry].
-1. Test the steps in the tagged quickstart guide after the PR merges. TODO add e2e tests! <!-- link to an e2e tests once we have such one -->
+1. Pushing the tag triggers CI action to build and publish the EPP image (`ghcr.io/llm-d/llm-d-router-endpoint-picker`) and sidecar image (`ghcr.io/llm-d/llm-d-router-disagg-sidecar`) to the [ghcr registry].
+1. Verify the [CI release workflow] completed successfully before proceeding.
+1. Test the steps in the tagged quickstart guide after the PR merges.
 
 ### Create the release!
 
 1. Create a [new release]:
     1. Choose the tag that you created for the release.
-    1. Use the tag as the release title, i.e. `v0.1.0` refer to previous release for the content of the release body.
+    1. Use the tag as the release title, e.g. `v0.1.0`.
     1. Click "Generate release notes" and preview the release body.
-    1. Go to Gateway Inference Extension latest release and make sure to include the highlights in llm-d-router as well.
+    1. Ensure the release body includes: highlights, breaking changes (if any), known issues, and upgrade steps.
     1. If this is a release candidate, select the "This is a pre-release" checkbox.
 1. If you find any bugs in this process, create an [issue].
 
@@ -131,7 +132,6 @@ Use the following steps to announce the release.
 
 [repo]: https://github.com/llm-d/llm-d-router
 [ghcr registry]: https://github.com/orgs/llm-d/packages?repo_name=llm-d-router
-[EPP image]: https://github.com/llm-d/llm-d-router/pkgs/container/llm-d-router-endpoint-picker
-[sidecar image]: https://github.com/llm-d/llm-d-router/pkgs/container/llm-d-router-disagg-sidecar
 [new release]: https://github.com/llm-d/llm-d-router/releases/new
 [issue]: https://github.com/llm-d/llm-d-router/issues/new/choose
+[CI release workflow]: https://github.com/llm-d/llm-d-router/actions/workflows/ci-release.yaml
@@ -57,6 +57,7 @@ runs:
         tags: |
           ${{ inputs.registry }}/${{ inputs.image-name }}:${{ inputs.tag }}
           ${{ inputs.push == 'true' && inputs.prerelease != 'true' && format('{0}/{1}:latest', inputs.registry, inputs.image-name) || '' }}
+          ${{ inputs.commit-sha != '' && format('{0}/{1}:{2}', inputs.registry, inputs.image-name, inputs.commit-sha) || '' }}
         build-args: |
           LDFLAGS=-s -w
           COMMIT_SHA=${{ inputs.commit-sha || 'unknown' }}
 
@@ -39,6 +39,35 @@ llm-d Router. Go service that routes inference requests to model-serving pods vi
 - State each fact once, in its canonical location. Do not duplicate across struct docs, prose, tables, inline comments, and examples.
 - Do not use Unicode symbols or special characters in general, unless explicitly requested.
 
+### Logging
+
+The codebase uses `go-logr` via controller-runtime. Verbosity constants are defined in `pkg/common/observability/logging` (`DEFAULT=2`, `VERBOSE=3`, `DEBUG=4`, `TRACE=5`).
+
+**Level conventions:**
+
+- `logger.Info(...)` for once-per-request operational signals.
+- `logger.V(logging.DEBUG).Info(...)` for per-item or per-loop signals that fire multiple times per request.
+- `logger.V(logging.TRACE).Info(...)` for detailed state transitions (cache operations, index updates).
+- `logger.Error(err, "msg", ...)` for recoverable errors that carry an underlying `error` value.
+
+**Use named constants, not bare integers:**
+
+```go
+// wrong
+logger.V(4).Info("running protocol", ...)
+
+// correct
+logger.V(logging.DEBUG).Info("running protocol", ...)
+```
+
+**Guard expensive log construction:**
+
+```go
+if v := logger.V(logging.DEBUG); v.Enabled() {
+    v.Info("payload details", "data", expensiveSerialization())
+}
+```
+
 ## Git workflow
 
 - DCO sign-off is required. Use `git commit -s`.
 
@@ -40,6 +40,10 @@ Documentation for developing the llm-d Router.
     - [Environment Configuration](#environment-configuration)
     - [Deploying Changes](#deploying-changes)
     - [Cleanup Environment](#cleanup-environment)
+  - [Logging](#logging)
+    - [Change log verbosity](#change-log-verbosity)
+    - [Add logs](#add-logs)
+    - [Passing Logger Around](#passing-logger-around)
   - [Submitting Changes](#submitting-changes)
     - [Scope](#scope)
     - [Presubmit](#presubmit)
@@ -169,14 +173,15 @@ PROM_ENABLED=true KIND_PROM_HOST_PORT=30091 make env-dev-kind
 
 ### Grafana Dashboard
 
-The upstream [Inference Gateway dashboard] covers EPP, inference pool, and vLLM metrics.
+The bundled [Inference Gateway dashboard] covers EPP metrics across the inference
+pool, inference objective, and flow control layers.
 
 Add a Prometheus datasource at `http://localhost:30090`, then import the JSON via
 **Dashboards > New > Import**. See the
 [Grafana installation docs](https://grafana.com/docs/grafana/latest/setup-grafana/installation/)
 for setup.
 
-[Inference Gateway dashboard]:https://github.com/kubernetes-sigs/gateway-api-inference-extension/blob/main/tools/dashboards/inference_gateway.json
+[Inference Gateway dashboard]:deploy/grafana/inference_gateway.json
 
 > [!NOTE]
 > For significant customization beyond the standard deployment, use the `deploy/components`
@@ -877,6 +882,87 @@ helm uninstall kgateway-crds -n kgateway-system
 For more details, see the Gateway API Inference Extension
 [getting started guide](https://gateway-api-inference-extension.sigs.k8s.io/guides/).
 
+## Logging
+
+We use the `logr.Logger` interface for logging everywhere.
+The logger instance is loaded from `context.Context` or passed around as an argument directly.
+This is aligned with contextual logging as explained in [k8s instrumentation logging guidelines](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-instrumentation/logging.md).
+
+In other words, we explicitly don't use `klog` global logging calls.
+Using `klog` log value helpers like `klog.KObj` is just fine.
+
+### Change log verbosity
+
+We generally follow the [k8s instrumentation logging guidelines](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-instrumentation/logging.md), which state "the practical default level is V(2). Developers and QE environments may wish to run at V(3) or V(4)".
+
+To configure logging verbosity, specify the `v` flag such as `--v=2`.
+
+If `--v` is not set explicitly, the default verbosity is V(2) (`DEFAULT`).
+### Add logs
+
+The [k8s instrumentation logging guidelines](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-instrumentation/logging.md) have the following definitions:
+
+- `logger.V(0).Info` = `logger.Info` - Generally useful for this to **always** be visible to a cluster operator
+- `logger.V(1).Info` - A reasonable default log level if you don't want verbosity.
+- `logger.V(2).Info` - Useful steady state information about the service and important log messages that may correlate to significant changes in the system. This is the recommended default log level for most systems.
+- `logger.V(3).Info` - Extended information about changes
+- `logger.V(4).Info` - Debug level verbosity
+- `logger.V(5).Info` - Trace level verbosity
+
+We choose to simplify to the following 4 common levels.
+
+```go
+const (
+	DEFAULT = 2
+	VERBOSE = 3
+	DEBUG   = 4
+	TRACE   = 5
+)
+```
+
+The guidelines are written in the context of a k8s controller. Our [epp](pkg/epp/) does more things such as handling requests and scraping metrics, therefore we adapt the guidelines as follows:
+
+1. The server startup process and configuration.
+
+   - `logger.Info` Logging at the `V(0)` verbosity level is generally welcome here as this is only logged once at startup, and provides useful info for debugging.
+
+2. Reconciler loops. The reconciler loops watch for CR changes such as the `InferenceObjective` CR. Given that changes in these CRs significantly affect the behavior of the extension, we recommend using the `V(DEFAULT)` verbosity level as the default, and using higher verbosity levels sparingly.
+
+   - `logger.V(DEFAULT)`
+     - Default log level in the reconcilers.
+     - Information about config (listening on X, watching Y)
+     - Errors that repeat frequently that relate to conditions that can be corrected (e.g., inference model not initialized yet)
+     - System state changing (adding/removing objects in the data store)
+   - `logger.V(VERBOSE)` and above: Use your best judgement.
+
+3. Inference request handling. These requests are expected to be much higher volume than the control flow in the reconcilers and therefore we should be mindful of log spamming. We recommend using `V(DEFAULT)` to log important info about a request, such as the HTTP response code, and higher verbosity levels for less important info.
+
+   - `logger.V(DEFAULT)`
+     - Logging the status code of an HTTP request
+     - Important decision making such as picking the target model, target pod
+   - `logger.V(VERBOSE)`
+     - Detailed request scheduling algorithm operations, such as running the filtering logic
+   - `logger.V(DEBUG)` and above: Use your best judgement.
+
+4. Metric scraping loops. These loops run at a very high frequency, and logs can be very spammy if not handled properly.
+
+   - `logger.V(TRACE)`
+     - Transient errors/warnings, such as failure to get response from a pod.
+     - Important state changes, such as updating a metric.
+
+5. Misc
+   1. Periodic (every 5s) debug loop which prints the current pods and metrics.
+      - `logger.V(DEFAULT).Error` If the metrics are not fresh enough, which indicates an error occurred during the metric scraping loop.
+      - `logger.V(DEBUG)`
+        - This is very important to debug the request scheduling algorithm, and yet not spammy compared to the metric scraping loop logs.
+
+### Passing Logger Around
+
+You can pass around a `context.Context` that contains a logger or a `logr.Logger` instance directly.
+Which one to use is a judgement call. Passing a `context.Context` is more standard; on the other hand, you then need to call `log.FromContext` everywhere.
+
+As `logger.V` calls are cumulative, i.e. `logger.V(2).V(3)` results in `logger.V(5)`, a logger should be passed around with no verbosity level set so that `logger.V(DEFAULT)` actually uses the `DEFAULT` verbosity level.
+
 ## Submitting Changes
 
 Read the [llm-d organization contributing guide](https://github.com/llm-d/llm-d/blob/main/CONTRIBUTING.md)
 
@@ -72,6 +72,7 @@ import (
 	srcmodels "github.com/llm-d/llm-d-router/pkg/epp/framework/plugins/datalayer/source/models"
 	sourcenotifications "github.com/llm-d/llm-d-router/pkg/epp/framework/plugins/datalayer/source/notifications"
 	"github.com/llm-d/llm-d-router/pkg/epp/framework/plugins/flowcontrol/fairness/globalstrict"
+	programaware "github.com/llm-d/llm-d-router/pkg/epp/framework/plugins/flowcontrol/fairness/program-aware"
 	"github.com/llm-d/llm-d-router/pkg/epp/framework/plugins/flowcontrol/fairness/roundrobin"
 	"github.com/llm-d/llm-d-router/pkg/epp/framework/plugins/flowcontrol/ordering/edf"
 	"github.com/llm-d/llm-d-router/pkg/epp/framework/plugins/flowcontrol/ordering/fcfs"
@@ -99,6 +100,7 @@ import (
 	"github.com/llm-d/llm-d-router/pkg/epp/framework/plugins/requesthandling/parsers/vllmhttp"
 	"github.com/llm-d/llm-d-router/pkg/epp/framework/plugins/scheduling/filter/bylabel"
 	"github.com/llm-d/llm-d-router/pkg/epp/framework/plugins/scheduling/filter/prefixcacheaffinity"
+	sessionaffinityfilter "github.com/llm-d/llm-d-router/pkg/epp/framework/plugins/scheduling/filter/sessionaffinity"
 	"github.com/llm-d/llm-d-router/pkg/epp/framework/plugins/scheduling/filter/sloheadroomtier"
 	"github.com/llm-d/llm-d-router/pkg/epp/framework/plugins/scheduling/picker/maxscore"
 	"github.com/llm-d/llm-d-router/pkg/epp/framework/plugins/scheduling/picker/random"
@@ -217,7 +219,7 @@ func (r *Runner) Run(ctx context.Context) error {
 	logutil.InitLogging(&opts.ZapOptions)
 
 	if opts.Tracing {
-		shutdown, err := tracing.InitTracing(ctx, setupLog, "llm-d-router/epp")
+		shutdown, err := tracing.InitTracing(ctx, setupLog, "llm-d-epp")
 		if err != nil {
 			return fmt.Errorf("failed to init tracing %w", err)
 		}
@@ -400,6 +402,7 @@ func (r *Runner) setup(ctx context.Context, cfg *rest.Config, opts *runserver.Op
 		PriorityBandControlPlane:         priorityBandControlPlane,
 		GRPCMaxRecvMsgSize:               opts.GRPCMaxRecvMsgSize,
 		GRPCMaxSendMsgSize:               opts.GRPCMaxSendMsgSize,
+		EnableGRPCStreamMetrics:          opts.EnableGRPCStreamMetrics,
 	}
 
 	if err := serverRunner.SetupWithManager(mgr); err != nil {
@@ -483,6 +486,7 @@ func (r *Runner) registerInTreePlugins() {
 	fwkplugin.Register(bylabel.EncodeRoleType, bylabel.EncodeRoleFactory)
 	fwkplugin.Register(bylabel.DecodeRoleType, bylabel.DecodeRoleFactory)
 	fwkplugin.Register(bylabel.PrefillRoleType, bylabel.PrefillRoleFactory)
+	fwkplugin.Register(sessionaffinityfilter.SessionAffinityType, sessionaffinityfilter.Factory)
 
 	// dataparallel profile handler
 	fwkplugin.Register(dataparallel.DataParallelProfileHandlerType, dataparallel.ProfileHandlerFactory)
@@ -522,6 +526,7 @@ func (r *Runner) registerInTreePlugins() {
 	// Flow Control plugins
 	fwkplugin.Register(globalstrict.GlobalStrictFairnessPolicyType, globalstrict.GlobalStrictFairnessPolicyFactory)
 	fwkplugin.Register(roundrobin.RoundRobinFairnessPolicyType, roundrobin.RoundRobinFairnessPolicyFactory)
+	fwkplugin.Register(programaware.ProgramAwarePluginType, programaware.ProgramAwarePluginFactory)
 	fwkplugin.Register(fcfs.FCFSOrderingPolicyType, fcfs.FCFSOrderingPolicyFactory)
 	fwkplugin.Register(edf.EDFOrderingPolicyType, edf.EDFOrderingPolicyFactory)
 	fwkplugin.Register(slodeadline.SLODeadlineOrderingPolicyType, slodeadline.SLODeadlineOrderingPolicyFactory)
@@ -943,6 +948,7 @@ func (r *Runner) runWithFileDiscovery(ctx context.Context, opts *runserver.Optio
 		SaturationDetector:               eppConfig.SaturationDetector,
 		GRPCMaxRecvMsgSize:               opts.GRPCMaxRecvMsgSize,
 		GRPCMaxSendMsgSize:               opts.GRPCMaxSendMsgSize,
+		EnableGRPCStreamMetrics:          opts.EnableGRPCStreamMetrics,
 	}
 
 	r.customCollectors = append(r.customCollectors, collectors.NewInferencePoolMetricsCollector(ds))