opendatahub-io
diff --git a/deployment/kuadrant-openshift/01-kserve-config-openshift.yaml b/deployment/kuadrant-openshift/01-kserve-config-openshift.yaml
Lines changed: 24 additions & 0 deletions
diff --git a/deployment/kuadrant-openshift/08-token-rate-limit-policy.yaml b/deployment/kuadrant-openshift/08-token-rate-limit-policy.yaml
Lines changed: 34 additions & 0 deletions
diff --git a/deployment/kuadrant-openshift/09-token-rate-limit-servicemonitor-envoy-shim.yaml b/deployment/kuadrant-openshift/09-token-rate-limit-servicemonitor-envoy-shim.yaml
Lines changed: 110 additions & 0 deletions
@@ -25,3 +25,27 @@ data:
       "cpuRequest": "200m",
       "cpuLimit": "1"
     }
+  logger: |  # JSON settings for the "logger" entry; same image tag and sizing as batcher and agent
+    {
+      "image": "kserve/agent:v0.15.2",
+      "memoryRequest": "100Mi",
+      "memoryLimit": "1Gi",
+      "cpuRequest": "100m",
+      "cpuLimit": "1"
+    }
+  batcher: |  # JSON settings for the "batcher" entry; uses the kserve/agent image as well
+    {
+      "image": "kserve/agent:v0.15.2",
+      "memoryRequest": "100Mi",
+      "memoryLimit": "1Gi",
+      "cpuRequest": "100m",
+      "cpuLimit": "1"
+    }
+  agent: |  # JSON settings for the "agent" entry; keep the image tag in step with logger and batcher
+    {
+      "image": "kserve/agent:v0.15.2",
+      "memoryRequest": "100Mi",
+      "memoryLimit": "1Gi",
+      "cpuRequest": "100m",
+      "cpuLimit": "1"
+    }
@@ -0,0 +1,34 @@
+# Gateway-level token rate limiting policy for OpenShift.
+# Tracks tokens from response bodies (usage.total_tokens); groups match the auth policy configuration.
+# NOTE(review): a user listed in both "free" and "premium" matches both limits — confirm that is intended.
+---
+apiVersion: kuadrant.io/v1alpha1
+kind: TokenRateLimitPolicy
+metadata:
+  name: gateway-token-rate-limits
+  namespace: llm
+spec:
+  targetRef:  # attaches the policy to the inference-gateway Gateway
+    group: gateway.networking.k8s.io
+    kind: Gateway
+    name: inference-gateway
+  limits:
+    free-user-tokens:  # 10000 tokens per 1m window for the "free" group
+      rates:
+        - limit: 10000
+          window: 1m
+      when:  # applies when the comma-separated groups string contains "free"
+        - predicate: |
+            auth.identity.groups.split(",").exists(g, g == "free")
+      counters:  # one counter per user ID
+        - expression: auth.identity.userid
+
+    premium-user-tokens:  # 50000 tokens per 1m window for the "premium" group
+      rates:
+        - limit: 50000
+          window: 1m
+      when:  # applies when the comma-separated groups string contains "premium"
+        - predicate: |
+            auth.identity.groups.split(",").exists(g, g == "premium")
+      counters:  # one counter per user ID
+        - expression: auth.identity.userid
@@ -0,0 +1,110 @@
+# Service and ServiceMonitor for Token Rate Limiting Metrics from Inference Gateway
+# Configures Prometheus scraping of Istio Envoy proxy metrics with token usage relabeling
+---
+apiVersion: v1
+kind: Service
+metadata:
+  namespace: llm
+  name: inference-gateway-envoy-metrics
+  labels:  # both labels are what the ServiceMonitor selector matches on
+    gateway.networking.k8s.io/gateway-name: inference-gateway
+    app.kubernetes.io/component: inference-gateway
+spec:
+  ports:
+    - name: http-envoy-metrics  # port name referenced by the ServiceMonitor endpoint
+      protocol: TCP
+      port: 15090
+      targetPort: 15090
+  selector:  # pods carrying this gateway-name label back the Service
+    gateway.networking.k8s.io/gateway-name: inference-gateway
+---
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  name: inference-gateway-envoy-metrics
+  namespace: llm
+  labels:
+    environment: base
+    project: models-aas-observability
+    app.kubernetes.io/component: inference-gateway
+    gateway.networking.k8s.io/gateway-name: inference-gateway
+spec:
+  namespaceSelector:  # only look for Services in the llm namespace
+    matchNames: [llm]
+  selector:  # both labels are set on the inference-gateway-envoy-metrics Service
+    matchLabels:
+      app.kubernetes.io/component: inference-gateway
+      gateway.networking.k8s.io/gateway-name: inference-gateway
+  endpoints:
+    - port: http-envoy-metrics  # named Service port (15090)
+      path: /stats/prometheus
+      interval: 15s
+      scrapeTimeout: 10s
+      honorLabels: true
+      metricRelabelings:  # rules run in listed order; every rule reads the metric name (__name__)
+        # Copy the user, group and namespace segments of token_usage_* metric names into labels
+        - action: replace
+          sourceLabels: [__name__]
+          regex: token_usage_with_user_and_group__user___([A-Za-z0-9_-]+)___group___([A-Za-z0-9_-]+)___namespace__([A-Za-z0-9_-]+).*
+          targetLabel: user
+          replacement: $1
+        - action: replace
+          sourceLabels: [__name__]
+          regex: token_usage_with_user_and_group__user___([A-Za-z0-9_-]+)___group___([A-Za-z0-9_-]+)___namespace__([A-Za-z0-9_-]+).*
+          targetLabel: group
+          replacement: $2
+        - action: replace
+          sourceLabels: [__name__]
+          regex: token_usage_with_user_and_group__user___([A-Za-z0-9_-]+)___group___([A-Za-z0-9_-]+)___namespace__([A-Za-z0-9_-]+).*
+          targetLabel: namespace  # NOTE(review): replaces any existing namespace label — confirm intended
+          replacement: $3
+        # Strip the embedded segments from the token usage metric name (must follow the extraction rules above)
+        - action: replace
+          sourceLabels: [__name__]
+          regex: (token_usage_with_user_and_group)__.*$
+          targetLabel: __name__
+          replacement: $1
+        # Extract labels from authorized_* metrics (no trailing .* unlike the token_usage_* rules — TODO confirm)
+        - action: replace
+          sourceLabels: [__name__]
+          regex: authorized_calls_with_user_and_group__user___([A-Za-z0-9_-]+)___group___([A-Za-z0-9_-]+)___namespace__([A-Za-z0-9_-]+)
+          targetLabel: user
+          replacement: $1
+        - action: replace
+          sourceLabels: [__name__]
+          regex: authorized_calls_with_user_and_group__user___([A-Za-z0-9_-]+)___group___([A-Za-z0-9_-]+)___namespace__([A-Za-z0-9_-]+)
+          targetLabel: group
+          replacement: $2
+        - action: replace
+          sourceLabels: [__name__]
+          regex: authorized_calls_with_user_and_group__user___([A-Za-z0-9_-]+)___group___([A-Za-z0-9_-]+)___namespace__([A-Za-z0-9_-]+)
+          targetLabel: namespace
+          replacement: $3
+        # Extract labels from limited_* metrics (same pattern shape as the authorized_* rules)
+        - action: replace
+          sourceLabels: [__name__]
+          regex: limited_calls_with_user_and_group__user___([A-Za-z0-9_-]+)___group___([A-Za-z0-9_-]+)___namespace__([A-Za-z0-9_-]+)
+          targetLabel: user
+          replacement: $1
+        - action: replace
+          sourceLabels: [__name__]
+          regex: limited_calls_with_user_and_group__user___([A-Za-z0-9_-]+)___group___([A-Za-z0-9_-]+)___namespace__([A-Za-z0-9_-]+)
+          targetLabel: group
+          replacement: $2
+        - action: replace
+          sourceLabels: [__name__]
+          regex: limited_calls_with_user_and_group__user___([A-Za-z0-9_-]+)___group___([A-Za-z0-9_-]+)___namespace__([A-Za-z0-9_-]+)
+          targetLabel: namespace
+          replacement: $3
+        # Strip the embedded segments from the call metric names (must follow the extraction rules above)
+        - action: replace
+          sourceLabels: [__name__]
+          regex: (authorized_calls_with_user_and_group)__.*$
+          targetLabel: __name__
+          replacement: $1
+        - action: replace
+          sourceLabels: [__name__]
+          regex: (limited_calls_with_user_and_group)__.*$
+          targetLabel: __name__
+          replacement: $1
+