Skip to content

Commit a5cf37c

Browse files
committed
Take zero cache hits into account when recording the prompt cached-tokens metric.
1 parent 164a757 commit a5cf37c

File tree

4 files changed

+10
-5
lines changed

4 files changed

+10
-5
lines changed

config/charts/inferencepool/templates/epp-config.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,11 @@ data:
1515
- name: default
1616
plugins:
1717
- pluginRef: queue-scorer
18+
weight: 2
1819
- pluginRef: kv-cache-utilization-scorer
20+
weight: 2
1921
- pluginRef: prefix-cache-scorer
22+
weight: 3
2023
{{- if (hasKey .Values.inferenceExtension "pluginsCustomConfig") }}
2124
{{- .Values.inferenceExtension.pluginsCustomConfig | toYaml | nindent 2 }}
2225
{{- end }}

pkg/epp/handlers/response.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,9 +86,11 @@ func (s *StreamingServer) HandleResponseBodyModelStreaming(ctx context.Context,
8686
reqCtx.Usage = resp.Usage
8787
metrics.RecordInputTokens(reqCtx.IncomingModelName, reqCtx.TargetModelName, resp.Usage.PromptTokens)
8888
metrics.RecordOutputTokens(reqCtx.IncomingModelName, reqCtx.TargetModelName, resp.Usage.CompletionTokens)
89+
cachedToken := 0
8990
if resp.Usage.PromptTokenDetails != nil {
90-
metrics.RecordPromptCachedTokens(reqCtx.IncomingModelName, reqCtx.TargetModelName, resp.Usage.PromptTokenDetails.CachedTokens)
91+
cachedToken = resp.Usage.PromptTokenDetails.CachedTokens
9192
}
93+
metrics.RecordPromptCachedTokens(reqCtx.IncomingModelName, reqCtx.TargetModelName, cachedToken)
9294
_, err := s.director.HandleResponseBodyComplete(ctx, reqCtx)
9395
if err != nil {
9496
logger.Error(err, "error in HandleResponseBodyComplete")

pkg/epp/handlers/server.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -316,9 +316,11 @@ func (s *StreamingServer) Process(srv extProcPb.ExternalProcessor_ProcessServer)
316316
metrics.RecordResponseSizes(reqCtx.IncomingModelName, reqCtx.TargetModelName, reqCtx.ResponseSize)
317317
metrics.RecordInputTokens(reqCtx.IncomingModelName, reqCtx.TargetModelName, reqCtx.Usage.PromptTokens)
318318
metrics.RecordOutputTokens(reqCtx.IncomingModelName, reqCtx.TargetModelName, reqCtx.Usage.CompletionTokens)
319+
cachedToken := 0
319320
if reqCtx.Usage.PromptTokenDetails != nil {
320-
metrics.RecordPromptCachedTokens(reqCtx.IncomingModelName, reqCtx.TargetModelName, reqCtx.Usage.PromptTokenDetails.CachedTokens)
321+
cachedToken = reqCtx.Usage.PromptTokenDetails.CachedTokens
321322
}
323+
metrics.RecordPromptCachedTokens(reqCtx.IncomingModelName, reqCtx.TargetModelName, cachedToken)
322324
}
323325
}
324326
}

pkg/epp/metrics/metrics.go

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -384,9 +384,7 @@ func RecordOutputTokens(modelName, targetModelName string, size int) {
384384

385385
// RecordPromptCachedTokens records prompt cached tokens count.
386386
func RecordPromptCachedTokens(modelName, targetModelName string, size int) {
387-
if size > 0 {
388-
promptCachedTokens.WithLabelValues(modelName, targetModelName).Observe(float64(size))
389-
}
387+
promptCachedTokens.WithLabelValues(modelName, targetModelName).Observe(float64(size))
390388
}
391389

392390
// RecordNormalizedTimePerOutputToken (NTPOT) records the normalized time per output token.

0 commit comments

Comments
 (0)