opendatahub-io · zdtsw · Jun 17, 2026 · Jun 16, 2026 · Jun 16, 2026 · Jun 16, 2026
diff --git a/.github/ISSUE_TEMPLATE/new-release.md b/.github/ISSUE_TEMPLATE/new-release.md
@@ -25,19 +25,26 @@ This document defines the process for releasing llm-d-router.
    `refs/tags/v*` restricts who can push release tags, which is what triggers
    the release build.
 
-1. Set the required environment variables based on the expected release number:
+1. Choose whether you are releasing a release candidate or an official release, and set the environment variables accordingly:
 
-   ```shell
-   export MAJOR=0
-   export MINOR=1
-   export PATCH=0
-   export REMOTE=origin
-   ```
+   - For a **Release Candidate** (e.g. `v0.9.0-rc.1`):
+     ```shell
+     export VERSION=v0.9.0-rc.1
+     export BRANCH_VERSION=0.9
+     export REMOTE=origin
+     ```
 
-1. If creating a release candidate, set the release candidate number.
+   - For an **Official Release** (e.g. `v0.9.0`):
+     ```shell
+     export VERSION=v0.9.0
+     export BRANCH_VERSION=0.9
+     export REMOTE=origin
+     ```
+
+1. (Optional) If the latency predictor release version does **not** align with the router version, also set the expected tag (refer to the [latency predictor releases] to find the latest valid release tag):
 
    ```shell
-   export RC=1
+   export LATENCY_PREDICTOR_TAG=v0.8.0-rc.1
    ```
 1. If needed, clone the llm-d-router [repo].
 
@@ -53,54 +60,46 @@ This document defines the process for releasing llm-d-router.
 
 1. Release Branch Handling:
    - For a Release Candidate:
-     Create a new release branch from the `main` branch. The branch should be named `release-${MAJOR}.${MINOR}`, for example, `release-0.1`:
+     Create a new release branch from the `main` branch. The branch should be named `release-${BRANCH_VERSION}`, for example, `release-0.9`:
 
      ```shell
-     git checkout -b release-${MAJOR}.${MINOR}
+     git checkout -b release-${BRANCH_VERSION}
      ```
 
    - For a Major, Minor or Patch Release:
      A release branch should already exist. In this case, check out the existing branch:
 
      ```shell
-     git checkout release-${MAJOR}.${MINOR} ${REMOTE}/release-${MAJOR}.${MINOR}
+     git checkout -b release-${BRANCH_VERSION} ${REMOTE}/release-${BRANCH_VERSION}
      ```
 
-1. Push your release branch to the llm-d-router remote.
+1. By default, `LATENCY_PREDICTOR_TAG` in the `Makefile` resolves to `EXTRA_TAG` if it is set, otherwise to the router release tag (via `BUILD_REF`), and falls back to `latest`. If the latency predictor tag does **not** align with the router version, update the default value of `LATENCY_PREDICTOR_TAG` in the `Makefile` to match your exported `${LATENCY_PREDICTOR_TAG}`.
+   Commit the change (if modified):
 
     ```shell
-    git push ${REMOTE} release-${MAJOR}.${MINOR}
+    # Update LATENCY_PREDICTOR_TAG ?= vX.Y.Z in Makefile
+    git commit -a -s -m "release: set LATENCY_PREDICTOR_TAG to ${LATENCY_PREDICTOR_TAG}"
     ```
 
-### Tag commit and trigger image build
-
-1. Tag the head of your release branch with the sem-ver release version.
-
-   For a release candidate:
-
-    ```shell
-    git tag -s -a v${MAJOR}.${MINOR}.${PATCH}-rc.${RC} -m "llm-d-router v${MAJOR}.${MINOR}.${PATCH}-rc.${RC} Release Candidate"
-    ```
-
-   For a major, minor or patch release:
+1. Push your release branch to the llm-d-router remote.
 
     ```shell
-    git tag -s -a v${MAJOR}.${MINOR}.${PATCH} -m "llm-d-router v${MAJOR}.${MINOR}.${PATCH} Release"
+    git push ${REMOTE} release-${BRANCH_VERSION}
     ```
 
-1. Push the tag to the llm-d-router repo.
+### Tag commit and trigger image build
 
-   For a release candidate:
+1. Tag the head of your release branch with the version:
 
-    ```shell
-    git push ${REMOTE} v${MAJOR}.${MINOR}.${PATCH}-rc.${RC}
-    ```
+     ```shell
+     git tag -s -a ${VERSION} -m "llm-d-router ${VERSION} Release"
+     ```
 
-   For a major, minor or patch release:
+1. Push the tag to the llm-d-router repo:
 
-    ```shell
-    git push ${REMOTE} v${MAJOR}.${MINOR}.${PATCH}
-    ```
+     ```shell
+     git push ${REMOTE} ${VERSION}
+     ```
 
 1. Pushing the tag triggers CI action to build and publish the EPP image (`ghcr.io/llm-d/llm-d-router-endpoint-picker`) and sidecar image (`ghcr.io/llm-d/llm-d-router-disagg-sidecar`) to the [ghcr registry].
 1. Verify the [CI release workflow] completed successfully before proceeding.
@@ -111,21 +110,49 @@ This document defines the process for releasing llm-d-router.
 1. Create a [new release]:
     1. Choose the tag that you created for the release.
     1. Use the tag as the release title, e.g. `v0.1.0`.
-    1. Click "Generate release notes" and preview the release body.
-    1. Ensure the release body includes: highlights, breaking changes (if any), known issues, and upgrade steps.
+    1. Click "Generate release notes" to auto-populate the list of PRs and contributors.
+    1. Summarize the release notes using an LLM of your choice (e.g., Gemini, Copilot, ChatGPT). Provide the newly compiled release notes block from `RELEASE-NOTES.md` (or the unreleased fragments in `release-notes.d/unreleased/`) with the following prompt:
+
+       ```text
+       Please summarize these release notes into three clear sections:
+       1. Highlights (key features, performance wins, bug fixes)
+       2. Upgrade Steps & Deprecations (configuration changes, deprecated flags/metrics)
+       3. Known Issues (if any, otherwise omit)
+       ```
+
+       Review the generated content, edit it as needed for accuracy, and then paste the summary at the very top of the release description box on GitHub, above the auto-generated notes.
     1. If this is a release candidate, select the "This is a pre-release" checkbox.
 1. If you find any bugs in this process, create an [issue].
 
 ## Announce the Release
 
 Use the following steps to announce the release.
 
-1. Send an announcement email to `llm-d-contributors@googlegroups.com` with the subject:
+1. Generate the announcement email content by running the following block in your terminal (make sure `${VERSION}` is set in your current shell):
 
    ```shell
-   [ANNOUNCE] llm-d-router v${MAJOR}.${MINOR}.${PATCH} is released
+   cat <<EOF
+   Subject: [ANNOUNCE] llm-d-router ${VERSION} is released
+
+   Hi all,
+
+   We are pleased to announce the release of llm-d-router ${VERSION}!
+
+   ### Container Images
+   * Endpoint Picker: ghcr.io/llm-d/llm-d-router-endpoint-picker:${VERSION}
+   * Disaggregated Sidecar: ghcr.io/llm-d/llm-d-router-disagg-sidecar:${VERSION}
+
+   ### Helm Charts (OCI)
+   * Standalone Chart: oci://ghcr.io/llm-d/charts/llm-d-router-standalone (version ${VERSION})
+   * Gateway Chart: oci://ghcr.io/llm-d/charts/llm-d-router-gateway (version ${VERSION})
+
+   ### Release Notes
+   For more details, please see the GitHub release notes: https://github.com/llm-d/llm-d-router/releases/tag/${VERSION}
+   EOF
    ```
 
+1. Copy the generated subject and body, and send an email to `llm-d-contributors@googlegroups.com`.
+
 1. Add a link to the final release in this issue.
 
 1. Close this issue.
@@ -135,3 +162,4 @@ Use the following steps to announce the release.
 [new release]: https://github.com/llm-d/llm-d-router/releases/new
 [issue]: https://github.com/llm-d/llm-d-router/issues/new/choose
 [CI release workflow]: https://github.com/llm-d/llm-d-router/actions/workflows/ci-release.yaml
+[latency predictor releases]: https://github.com/orgs/llm-d/packages?repo_name=llm-d-latency-predictor
diff --git a/Makefile b/Makefile
@@ -55,6 +55,7 @@ GIT_COMMIT_SHA ?= $(shell git rev-parse HEAD 2>/dev/null)
 # Match only root-level release tags (v[0-9]*) so submodule tags don't leak into image versions.
 ROOT_RELEASE_TAG_MATCH ?= v[0-9]*
 BUILD_REF ?= $(shell git describe --tags --match '$(ROOT_RELEASE_TAG_MATCH)' --abbrev=0 2>/dev/null)
+LATENCY_PREDICTOR_TAG ?= $(or $(EXTRA_TAG),$(BUILD_REF),latest)
 
 # Host directories for Go module and build caches, bind-mounted into the builder container.
 GO_MOD_CACHE_VOL ?= $(HOME)/.cache/llm-d-gomodcache
@@ -214,7 +215,7 @@ check-latest-tags-strict: ## Check ':latest' image tags in YAML (strict; fails o
 
 .PHONY: presubmit
 presubmit: LINT_NEW_ONLY=true
-presubmit: git-branch-check signed-commits-check go-mod-check format lint vulncheck check-latest-tags
+presubmit: git-branch-check signed-commits-check go-mod-check format lint vulncheck check-latest-tags-strict
 
 .PHONY: git-branch-check
 git-branch-check:
@@ -348,7 +349,7 @@ verify-helm-charts: helm-install kubectl-validate ## Render and validate Helm ch
 .PHONY: helm-push
 helm-push: yq helm-install ## Package and push a specified Helm chart. Usage: make helm-push CHART=<chart_name>
 	@if [ -z "$(CHART)" ]; then echo "Error: CHART variable is required (e.g. CHART=llm-d-router-standalone)"; exit 1; fi
-	CHART=$(CHART) EXTRA_TAG="$(EXTRA_TAG)" CHART_SUFFIX="$(CHART_SUFFIX)" EPP_RELEASE_IMAGE_REPOSITORY="$(EPP_RELEASE_IMAGE_REPOSITORY)" YQ="$(YQ)" HELM="$(HELM)" ./hack/push-chart.sh
+	CHART=$(CHART) EXTRA_TAG="$(EXTRA_TAG)" CHART_SUFFIX="$(CHART_SUFFIX)" EPP_RELEASE_IMAGE_REPOSITORY="$(EPP_RELEASE_IMAGE_REPOSITORY)" LATENCY_PREDICTOR_TAG="$(LATENCY_PREDICTOR_TAG)" YQ="$(YQ)" HELM="$(HELM)" ./hack/push-chart.sh
 
 .PHONY: helm-push-gateway
 helm-push-gateway: ## Package and push the llm-d-router-gateway Helm chart.

diff --git a/config/manifests/sglang/gpu-deployment.yaml b/config/manifests/sglang/gpu-deployment.yaml
@@ -17,7 +17,7 @@ spec:
     spec:
       containers:
         - name: sglang
-          image: lmsysorg/sglang:latest
+          image: lmsysorg/sglang:v0.5.12
           command: ["python3", "-m", "sglang.launch_server"]
           args:
             - "--model-path=Qwen/Qwen3-32B"

diff --git a/config/manifests/vllm/gpu-deployment.yaml b/config/manifests/vllm/gpu-deployment.yaml
@@ -15,7 +15,7 @@ spec:
     spec:
       containers:
         - name: vllm
-          image: "vllm/vllm-openai:latest"
+          image: "vllm/vllm-openai:v0.21.0"
           imagePullPolicy: Always
           command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
           args:

diff --git a/config/manifests/vllm/gpu-grpc-deployment.yaml b/config/manifests/vllm/gpu-grpc-deployment.yaml
@@ -15,7 +15,7 @@ spec:
     spec:
       containers:
         - name: vllm-server
-          image: vllm/vllm-openai:latest
+          image: vllm/vllm-openai:v0.21.0
           command: ["python3", "-m", "vllm.entrypoints.grpc_server"]
           args:
           - "--model"

diff --git a/config/manifests/vllm/gpu-multilora-deployment.yaml b/config/manifests/vllm/gpu-multilora-deployment.yaml
@@ -15,7 +15,7 @@ spec:
     spec:
       containers:
         - name: vllm
-          image: "vllm/vllm-openai:latest"
+          image: "vllm/vllm-openai:v0.21.0"
           imagePullPolicy: Always
           command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
           args:

diff --git a/config/manifests/vllm/gpu-prefix-cache-deployment.yaml b/config/manifests/vllm/gpu-prefix-cache-deployment.yaml
@@ -15,7 +15,7 @@ spec:
     spec:
       containers:
         - name: vllm
-          image: "vllm/vllm-openai:latest"
+          image: "vllm/vllm-openai:v0.21.0"
           imagePullPolicy: Always
           command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
           args:

diff --git a/pkg/epp/framework/plugins/requestcontrol/dataproducer/preciseprefixcache/producer.go b/pkg/epp/framework/plugins/requestcontrol/dataproducer/preciseprefixcache/producer.go
@@ -229,7 +229,7 @@ func (p *Producer) Produce(ctx context.Context,
 	)
 	defer span.End()
 
-	span.SetAttributes(attribute.Int("llm_d.producer.candidate_endpoints", len(endpoints)))
+	span.SetAttributes(attribute.Int("llm_d.epp.producer.candidate_endpoints", len(endpoints)))
 	if request != nil {
 		if request.TargetModel != "" {
 			span.SetAttributes(attribute.String("gen_ai.request.model", request.TargetModel))
@@ -245,7 +245,7 @@ func (p *Producer) Produce(ctx context.Context,
 		return fmt.Errorf("failed to compute block keys: %w", err)
 	}
 	if len(perPromptKeys) == 0 {
-		span.SetAttributes(attribute.String("llm_d.producer.result", "skipped_no_tokens"))
+		span.SetAttributes(attribute.String("llm_d.epp.producer.result", "skipped_no_tokens"))
 		return nil
 	}
 
@@ -310,8 +310,8 @@ func (p *Producer) produceFromBlockKeys(ctx context.Context, span trace.Span,
 	}
 
 	span.SetAttributes(
-		attribute.Int("llm_d.producer.total_blocks", totalBlocks),
-		attribute.Int("llm_d.producer.max_match_blocks", maxMatch),
+		attribute.Int("llm_d.epp.producer.total_blocks", totalBlocks),
+		attribute.Int("llm_d.epp.producer.max_match_blocks", maxMatch),
 	)
 
 	logger.V(logging.TRACE).Info("Produce completed",

diff --git a/pkg/epp/framework/plugins/scheduling/profilehandler/disagg/disagg_profile_handler.go b/pkg/epp/framework/plugins/scheduling/profilehandler/disagg/disagg_profile_handler.go
@@ -271,7 +271,7 @@ func (h *Handler) Pick(ctx context.Context, request *scheduling.InferenceRequest
 	defer span.End()
 
 	if request == nil {
-		span.SetAttributes(attribute.String("llm_d.profile_handler.decision", "complete_nil_request"))
+		span.SetAttributes(attribute.String("llm_d.epp.profile_handler.decision", "complete_nil_request"))
 		return map[string]scheduling.SchedulerProfile{}
 	}
 
@@ -284,18 +284,18 @@ func (h *Handler) Pick(ctx context.Context, request *scheduling.InferenceRequest
 	if _, executed := profileResults[h.decodeProfile]; !executed {
 		decodeProfile, ok := profiles[h.decodeProfile]
 		if !ok {
-			span.SetAttributes(attribute.String("llm_d.profile_handler.decision", "error_missing_decode_profile"))
+			span.SetAttributes(attribute.String("llm_d.epp.profile_handler.decision", "error_missing_decode_profile"))
 			return map[string]scheduling.SchedulerProfile{}
 		}
-		span.SetAttributes(attribute.String("llm_d.profile_handler.decision", "run_decode"))
+		span.SetAttributes(attribute.String("llm_d.epp.profile_handler.decision", "run_decode"))
 		return map[string]scheduling.SchedulerProfile{h.decodeProfile: decodeProfile}
 	}
 
 	decodeRes := profileResults[h.decodeProfile]
 	if decodeRes == nil || len(decodeRes.TargetEndpoints) == 0 {
 		span.SetAttributes(
-			attribute.String("llm_d.profile_handler.decision", "complete"),
-			attribute.Bool("llm_d.profile_handler.decode_failed", true),
+			attribute.String("llm_d.epp.profile_handler.decision", "complete"),
+			attribute.Bool("llm_d.epp.profile_handler.decode_failed", true),
 		)
 		return map[string]scheduling.SchedulerProfile{}
 	}
@@ -304,25 +304,25 @@ func (h *Handler) Pick(ctx context.Context, request *scheduling.InferenceRequest
 	if _, hasEncodeProfile := profiles[h.encodeProfile]; hasEncodeProfile {
 		if _, executed := profileResults[h.encodeProfile]; !executed {
 			if h.encodeDecider != nil && h.encodeDecider.disaggregate(ctx, request, decodeRes.TargetEndpoints[0]) {
-				span.SetAttributes(attribute.String("llm_d.profile_handler.decision", "run_encode"))
+				span.SetAttributes(attribute.String("llm_d.epp.profile_handler.decision", "run_encode"))
 				return map[string]scheduling.SchedulerProfile{h.encodeProfile: profiles[h.encodeProfile]}
 			}
 			// Decider rejected encode - mark as evaluated so we don't re-run the decider.
 			profileResults[h.encodeProfile] = nil
-			span.SetAttributes(attribute.String("llm_d.profile_handler.decision", "skip_encode"))
+			span.SetAttributes(attribute.String("llm_d.epp.profile_handler.decision", "skip_encode"))
 		}
 	}
 
 	// ── Stage 3: Prefill (optional) ────────────────────────────────────────
 	if _, hasPrefillProfile := profiles[h.prefillProfile]; hasPrefillProfile {
 		if _, executed := profileResults[h.prefillProfile]; !executed {
 			if h.pdDecider != nil && h.pdDecider.disaggregate(ctx, request, decodeRes.TargetEndpoints[0]) {
-				span.SetAttributes(attribute.String("llm_d.profile_handler.decision", "run_prefill"))
+				span.SetAttributes(attribute.String("llm_d.epp.profile_handler.decision", "run_prefill"))
 				return map[string]scheduling.SchedulerProfile{h.prefillProfile: profiles[h.prefillProfile]}
 			}
 			// Decider rejected prefill - mark as evaluated so we don't re-run the decider.
 			profileResults[h.prefillProfile] = nil
-			span.SetAttributes(attribute.String("llm_d.profile_handler.decision", "skip_prefill"))
+			span.SetAttributes(attribute.String("llm_d.epp.profile_handler.decision", "skip_prefill"))
 		}
 	}
 
@@ -332,7 +332,7 @@ func (h *Handler) Pick(ctx context.Context, request *scheduling.InferenceRequest
 
 	decision := DisaggDecisionType(encodeUsed, prefillUsed)
 	RecordDisaggDecision(h.typedName.Name, h.typedName.Type, request.TargetModel, decision)
-	span.SetAttributes(attribute.String("llm_d.profile_handler.decision", "complete_"+decision))
+	span.SetAttributes(attribute.String("llm_d.epp.profile_handler.decision", "complete_"+decision))
 
 	return map[string]scheduling.SchedulerProfile{}
 }

diff --git a/pkg/epp/framework/plugins/scheduling/profilehandler/disagg/pd_profile_handler.go b/pkg/epp/framework/plugins/scheduling/profilehandler/disagg/pd_profile_handler.go
@@ -167,8 +167,8 @@ func (h *PdProfileHandler) Pick(ctx context.Context, request *scheduling.Inferen
 
 	// Set initial attributes
 	span.SetAttributes(
-		attribute.Int("llm_d.profile_handler.total_profiles", len(profiles)),
-		attribute.Int("llm_d.profile_handler.executed_profiles", len(profileResults)),
+		attribute.Int("llm_d.epp.profile_handler.total_profiles", len(profiles)),
+		attribute.Int("llm_d.epp.profile_handler.executed_profiles", len(profileResults)),
 	)
 
 	// Set optional request attributes if request is not nil
@@ -184,8 +184,8 @@ func (h *PdProfileHandler) Pick(ctx context.Context, request *scheduling.Inferen
 	if _, executed := profileResults[h.decodeProfile]; !executed {
 		// if decode profile was not executed yet, first let the scheduler run the decode profile
 		span.SetAttributes(
-			attribute.String("llm_d.profile_handler.decision", "run_decode"),
-			attribute.String("llm_d.profile_handler.selected_profile", h.decodeProfile),
+			attribute.String("llm_d.epp.profile_handler.decision", "run_decode"),
+			attribute.String("llm_d.epp.profile_handler.selected_profile", h.decodeProfile),
 		)
 		return map[string]scheduling.SchedulerProfile{
 			h.decodeProfile: profiles[h.decodeProfile],
@@ -197,8 +197,8 @@ func (h *PdProfileHandler) Pick(ctx context.Context, request *scheduling.Inferen
 	// check if all configured profiles have been executed, or if decode failed, no need to run more profiles.
 	if len(profiles) == len(profileResults) || profileResults[h.decodeProfile] == nil {
 		span.SetAttributes(
-			attribute.String("llm_d.profile_handler.decision", "complete"),
-			attribute.Bool("llm_d.profile_handler.decode_failed", profileResults[h.decodeProfile] == nil),
+			attribute.String("llm_d.epp.profile_handler.decision", "complete"),
+			attribute.Bool("llm_d.epp.profile_handler.decode_failed", profileResults[h.decodeProfile] == nil),
 		)
 		return map[string]scheduling.SchedulerProfile{}
 	}
@@ -207,8 +207,8 @@ func (h *PdProfileHandler) Pick(ctx context.Context, request *scheduling.Inferen
 		RecordPDDecision(h.typedName.Name, h.typedName.Type, request.TargetModel, DecisionTypePrefillDecode) //nolint:staticcheck // intentional: pd-profile-handler is itself deprecated
 		// run the prefill profile
 		span.SetAttributes(
-			attribute.String("llm_d.profile_handler.decision", "prefill_decode"),
-			attribute.String("llm_d.profile_handler.selected_profile", h.prefillProfile),
+			attribute.String("llm_d.epp.profile_handler.decision", "prefill_decode"),
+			attribute.String("llm_d.epp.profile_handler.selected_profile", h.prefillProfile),
 		)
 		return map[string]scheduling.SchedulerProfile{
 			h.prefillProfile: profiles[h.prefillProfile],
@@ -217,7 +217,7 @@ func (h *PdProfileHandler) Pick(ctx context.Context, request *scheduling.Inferen
 
 	RecordPDDecision(h.typedName.Name, h.typedName.Type, request.TargetModel, DecisionTypeDecodeOnly) //nolint:staticcheck // intentional: pd-profile-handler is itself deprecated
 	span.SetAttributes(
-		attribute.String("llm_d.profile_handler.decision", "decode_only"),
+		attribute.String("llm_d.epp.profile_handler.decision", "decode_only"),
 	)
 	return map[string]scheduling.SchedulerProfile{} // do not run prefill
 }