llm-d · github-actions · Feb 10, 2026 · Jan 4, 2026 · Jan 5, 2026 · Jan 6, 2026
diff --git a/deploy/config/pd-epp-config.yaml b/deploy/config/pd-epp-config.yaml
@@ -1,23 +1,39 @@
 # Sample EPP configuration for running with P/D
 apiVersion: inference.networking.x-k8s.io/v1alpha1
 kind: EndpointPickerConfig
+featureGates:
+- prepareDataPlugins
 plugins:
 - type: prefill-header-handler
 - type: prefix-cache-scorer
+  parameters:
+    maxPrefixBlocksToMatch: 256
+    lruCapacityPerServer: 31250
+- type: queue-scorer
 - type: prefill-filter
 - type: decode-filter
 - type: max-score-picker
+- type: prefix-based-pd-decider
+  parameters:
+    nonCachedTokens: 16
 - type: pd-profile-handler
+  parameters:
+    primaryPort: ${PRIMARY_PORT}
+    deciderPluginName: prefix-based-pd-decider
 schedulingProfiles:
 - name: prefill
   plugins:
   - pluginRef: prefill-filter
   - pluginRef: max-score-picker
   - pluginRef: prefix-cache-scorer
     weight: 2
+  - pluginRef: queue-scorer
+    weight: 1
 - name: decode
   plugins:
   - pluginRef: decode-filter
   - pluginRef: max-score-picker
   - pluginRef: prefix-cache-scorer
     weight: 2
+  - pluginRef: queue-scorer
+    weight: 1
diff --git a/deploy/config/sim-pd-epp-config.yaml b/deploy/config/sim-pd-epp-config.yaml
@@ -2,31 +2,41 @@
 # Use with small hash block size for simulation purposes
 apiVersion: inference.networking.x-k8s.io/v1alpha1
 kind: EndpointPickerConfig
+featureGates:
+- prepareDataPlugins
 plugins:
 - type: prefill-header-handler
 - type: prefix-cache-scorer
   parameters:
-    hashBlockSize: 5
+    blockSizeTokens: 16
+    autoTune: false
     maxPrefixBlocksToMatch: 256
     lruCapacityPerServer: 31250
+- type: queue-scorer
 - type: prefill-filter
 - type: decode-filter
 - type: max-score-picker
+- type: prefix-based-pd-decider
+  parameters:
+    nonCachedTokens: 16
 - type: pd-profile-handler
   parameters:
-    threshold: 10
-    hashBlockSize: 5
     primaryPort: ${PRIMARY_PORT}
+    deciderPluginName: prefix-based-pd-decider
 schedulingProfiles:
 - name: prefill
   plugins:
   - pluginRef: prefill-filter
   - pluginRef: max-score-picker
   - pluginRef: prefix-cache-scorer
     weight: 2
+  - pluginRef: queue-scorer
+    weight: 1
 - name: decode
   plugins:
   - pluginRef: decode-filter
   - pluginRef: max-score-picker
   - pluginRef: prefix-cache-scorer
     weight: 2
+  - pluginRef: queue-scorer
+    weight: 1
diff --git a/docs/architecture.md b/docs/architecture.md
@@ -206,15 +206,41 @@ Selects the profiles to use when running with disaggregated prefill/decode
 
 - **Type**: `pd-profile-handler`
 - **Parameters**:
-  - `threshold`: specifies the threshold at which there are enough new input tokens to send the request to prefill and then decode, vs just to decode.
-  - `hashBlockSize`: specifies the length of the prompt chunk that a block is keyed by. This must the same value used for the PrefixCachePlugin.
   - `decodeProfile`: specifies the name of the profile used for the decode scheduling. Only needed if the decode profile is not named `decode`.
   - `prefillProfile`: specifies the name of the profile used for the prefill scheduling. Only needed if the prefill profile is not named `prefill`.
+  - `deciderPluginName`: specifies the name of the decider plugin, which determines whether disaggregated P/D should be executed for a request.
+  - `primaryPort`: the base port number used for data parallel communication.
 
 **Note:** When using this plugin you must also have a PrefixCachePlugin configured in the prefill and decode scheduling profiles.
 
 ---
 
+#### Prefix Based Decider Plugin
+
+Type: `prefix-based-pd-decider`
+
+**Parameters**
+- `nonCachedTokens`: length, in tokens, of the non-cached part of the user input above which disaggregated P/D is triggered.
+
+Note: the `prepareDataPlugins` feature gate must be enabled.
+
+**Example**
+```yaml
+kind: EndpointPickerConfig
+featureGates:
+- prepareDataPlugins
+plugins:
+- type: prefix-based-pd-decider
+  parameters:
+    nonCachedTokens: 4
+- type: pd-profile-handler
+  parameters:
+    primaryPort: 8000
+    deciderPluginName: prefix-based-pd-decider
+```
+
+---
+
 #### ByLabelSelector
 
 Filters out pods using a standard Kubernetes label selector.

diff --git a/docs/disagg_pd.md b/docs/disagg_pd.md
@@ -155,6 +155,8 @@ Below is a minimal `EndpointPickerConfig` that enables integration with workload
 ```yaml
 apiVersion: inference.networking.x-k8s.io/v1alpha1
 kind: EndpointPickerConfig
+featureGates:
+- prepareDataPlugins
 plugins:
   # Prefill selection: match Pods with label role=prefill
   - type: by-label
@@ -176,10 +178,12 @@ plugins:
       lruCapacityPerServer: 31250
   - type: max-score-picker
   - type: prefill-header-handler
-  - type: pd-profile-handler
+  - type: prefix-based-pd-decider
     parameters:
-      threshold: 0
-      hashBlockSize: 5
+      nonCachedTokens: 8
+  - type: pd-profile-handler
+    parameters:    
+      deciderPluginName: prefix-based-pd-decider
       primaryPort: 8000
 schedulingProfiles:
   - name: prefill
@@ -200,6 +204,59 @@ schedulingProfiles:
 
 ![Disaggregated Prefill/Decode Architecture](./images/dp_architecture.png)
 
+--- 
+## PD Deciders
+
+PD deciders are plugins used by the PD profile handler to determine whether disaggregated P/D should be executed for a given request, based on the properties of the request prompt.
+
+
+### Prefix-Based PD Decider
+
+The `prefix-based-pd-decider` plugin makes the disaggregation decision according to the length of the non-cached suffix of the prompt relative to tokens already cached on the selected decode pod.
+
+**How It Works**
+- Once a decode pod is selected, the decider checks how many tokens from the incoming prompt have already been sent to this pod.
+
+- If the remaining non-cached suffix is longer than the configured threshold (`nonCachedTokens`), disaggregation is triggered — the prefill runs remotely on a prefill pod, and the decode runs locally on the decode pod.
+
+- If the non-cached suffix is shorter than or equal to the threshold, the full request runs locally on the decode pod without remote prefill.
+
+**Configuration**
+```yaml
+- type: prefix-based-pd-decider
+  parameters:
+    nonCachedTokens: 8
+```
+
+**Parameter:**
+
+- `nonCachedTokens`: Number of non-cached tokens that trigger disaggregation
+  - If set to 0, disaggregation always occurs for all requests
+
+**Feature Gate Requirement**
+To activate this decider, ensure the following feature gate is enabled in your `EndpointPickerConfig`:
+
+```yaml
+featureGates:
+- prepareDataPlugins
+```
+
+
+### Always-Disagg PD Decider
+The `always-disagg-pd-decider` is a simpler alternative used mainly for testing or benchmarking.
+It always triggers disaggregation, regardless of prefix cache state or prompt characteristics.
+
+**Configuration example:**
+
+```yaml
+- type: always-disagg-pd-decider
+```
+
+**Notes:**
+This plugin accepts no parameters.
+
+It’s useful for validating end-to-end prefill/decode splitting and comparing system performance under forced disaggregation.
+
 ---
 
 ## References
diff --git a/pkg/plugins/profile/always_disagg_decider.go b/pkg/plugins/profile/always_disagg_decider.go
@@ -0,0 +1,48 @@
+package profile
+
+import (
+	"context"
+	"encoding/json"
+
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/framework/interface/plugin"
+	"sigs.k8s.io/gateway-api-inference-extension/pkg/epp/framework/interface/scheduling"
+)
+
+// AlwaysDisaggDeciderPluginType is the type name of the AlwaysDisaggPDDecider plugin.
+const AlwaysDisaggDeciderPluginType = "always-disagg-pd-decider"
+
+// Compile-time check that *AlwaysDisaggPDDecider satisfies pdDeciderPlugin.
+var _ pdDeciderPlugin = (*AlwaysDisaggPDDecider)(nil)
+
+// AlwaysDisaggPDDecider is a PD decider plugin that always decides to disaggregate prefill and decode.
+type AlwaysDisaggPDDecider struct {
+	typedName plugin.TypedName
+}
+
+// AlwaysDisaggPDDeciderPluginFactory is the plugin factory for AlwaysDisaggPDDecider.
+// It builds a new decider and gives it the supplied instance name. The raw
+// parameters and the plugin handle are not used, and the returned error is
+// always nil.
+func AlwaysDisaggPDDeciderPluginFactory(name string, _ json.RawMessage, _ plugin.Handle) (plugin.Plugin, error) {
+	decider := newAlwaysDisaggPDDecider()
+	decider.WithName(name)
+	return decider, nil
+}
+
+// newAlwaysDisaggPDDecider returns a new AlwaysDisaggPDDecider whose typed name
+// already carries the plugin type. The instance name is left empty; callers
+// set it with WithName.
+func newAlwaysDisaggPDDecider() *AlwaysDisaggPDDecider {
+	return &AlwaysDisaggPDDecider{
+		// Populate the type here: nothing else in this file sets it, so without
+		// this TypedName would report an empty plugin type.
+		typedName: plugin.TypedName{Type: AlwaysDisaggDeciderPluginType},
+	}
+}
+
+// TypedName returns the typed name stored on this decider.
+func (d *AlwaysDisaggPDDecider) TypedName() plugin.TypedName {
+	return d.typedName
+}
+
+// WithName sets the instance name of the plugin and returns the same decider so that calls can be chained.
+func (d *AlwaysDisaggPDDecider) WithName(name string) *AlwaysDisaggPDDecider {
+	d.typedName.Name = name
+	return d
+}
+
+// disaggregate reports whether the request should be served with disaggregated
+// prefill/decode. This decider answers true unconditionally: the context, the
+// input token count and the endpoint are all ignored.
+func (d *AlwaysDisaggPDDecider) disaggregate(_ context.Context, _ int, _ scheduling.Endpoint) bool {
+	return true
+}