opentelekomcloud-infra · SebastianGode · May 5, 2026 · May 5, 2026 · May 5, 2026 · May 5, 2026
@@ -486,6 +486,40 @@ applications:
         syncOptions:
           - CreateNamespace=true
 
+  - name: litellm  # LiteLLM proxy application; rolled out to the preprod cluster only
+    clusters: [preprod]
+    config:
+      project: infra
+      repoURL: "https://github.com/opentelekomcloud-infra/system-config.git"
+      targetRevision: litellm  # NOTE(review): looks like a feature branch rather than a default branch — confirm
+      path: kubernetes/kustomize/litellm/overlays/preprod/
+      namespace: litellm
+      pluginName: argocd-vault-plugin-kustomize  # manifests carry <path:...> placeholders for this plugin
+      pluginEnv: "."
+      syncPolicy:
+        syncOptions:
+          - CreateNamespace=true
+        automated:
+          selfHeal: true
+          prune: true
+
+  - name: llamacpp  # llama.cpp inference server application; rolled out to the preprod cluster only
+    clusters: [preprod]
+    config:
+      project: infra
+      repoURL: "https://github.com/opentelekomcloud-infra/system-config.git"
+      targetRevision: litellm  # NOTE(review): same branch name as the litellm app — confirm this is intended
+      path: kubernetes/kustomize/llamacpp/overlays/preprod/
+      namespace: llamacpp
+      pluginName: argocd-vault-plugin-kustomize  # manifests carry <path:...> placeholders for this plugin
+      pluginEnv: "."
+      syncPolicy:
+        syncOptions:
+          - CreateNamespace=true
+        automated:
+          selfHeal: true
+          prune: true
+
   - name: mcaptcha
     clusters: [preprod]
     config:

@@ -0,0 +1,27 @@
+apiVersion: v1
+kind: ConfigMap  # carries the LiteLLM proxy configuration file as the single key config.yaml
+metadata:
+  name: litellm-config  # consumed by the "config" volume of the litellm-proxy Deployment
+data:
+  config.yaml: |
+    general_settings:
+      control_plane_url: "https://litellm.eco-preprod.tsi-dev.otc-service.com"
+
+    litellm_settings:
+      num_retries: 4
+      request_timeout: 6000
+      callbacks: ["prometheus"]
+      require_auth_for_metrics_endpoint: true
+
+    router_settings:
+      routing_strategy: least-busy
+      num_retries: 4
+      timeout: 6000
+
+      optional_pre_call_checks:
+        - deployment_affinity
+
+      deployment_affinity_ttl_seconds: 60
+
+      allowed_fails: 4
+      cooldown_time: 30
@@ -0,0 +1,50 @@
+apiVersion: apps/v1
+kind: Deployment  # single-replica LiteLLM proxy; config file comes from the litellm-config ConfigMap
+metadata:
+  name: litellm-proxy
+  labels:
+    app: litellm
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: litellm
+  template:
+    metadata:
+      labels:
+        app: litellm
+    spec:
+      volumes:
+        - name: config
+          configMap:
+            name: litellm-config
+      containers:
+        - name: litellm
+          image: swr.eu-de.otc.t-systems.com/opentelekomcloud/litellm:v1.87.0
+          ports:
+            - name: http
+              protocol: TCP
+              containerPort: 4000  # same port the --port argument below passes to the proxy
+          args:
+            - --config
+            - /app/config.yaml  # path of the file mounted by volumeMounts below
+            - --port
+            - '4000'
+          env:
+            - name: LITELLM_MASTER_KEY
+              value: '<path:secret/data/litellm/api#master-key>'  # argocd-vault-plugin placeholder
+            - name: DATABASE_URL
+              value: '<path:secret/data/litellm/postgres#database-url>'  # argocd-vault-plugin placeholder
+            - name: STORE_MODEL_IN_DB
+              value: 'True'  # kept as a string; env values must not be YAML booleans
+          volumeMounts:
+            - name: config
+              subPath: config.yaml  # mount only this key as a single file
+              mountPath: /app/config.yaml
+          resources:
+            limits:
+              cpu: '2'
+              memory: 2Gi
+            requests:
+              cpu: 500m
+              memory: 512Mi
@@ -0,0 +1,7 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization  # base for LiteLLM: ConfigMap, Deployment and Service
+
+resources:
+  - config.yaml
+  - deployment.yaml
+  - service.yaml
@@ -0,0 +1,15 @@
+apiVersion: v1
+kind: Service  # ClusterIP endpoint in front of the pods labelled app=litellm
+metadata:
+  name: litellm-service  # backend name used by the litellm Ingress
+  labels:
+    app: litellm
+spec:
+  selector:
+    app: litellm
+  type: ClusterIP
+  ports:
+    - port: 4000
+      targetPort: 4000
+      protocol: TCP
+      name: http
@@ -0,0 +1,29 @@
+apiVersion: networking.k8s.io/v1
+kind: Ingress  # publishes litellm-service:4000 on the preprod hostname via the nginx ingress class
+metadata:
+  name: litellm
+  annotations:
+    cert-manager.io/cluster-issuer: letsencrypt-prod
+    nginx.ingress.kubernetes.io/backend-protocol: "HTTP"
+    nginx.ingress.kubernetes.io/whitelist-source-range: "80.158.0.0/16,164.30.0.0/16,62.156.0.0/14,79.152.111.178/32"  # source CIDR allow-list
+    nginx.ingress.kubernetes.io/proxy-read-timeout: "3600"  # presumably sized for long-running completions — confirm
+    nginx.ingress.kubernetes.io/proxy-send-timeout: "3600"
+    nginx.ingress.kubernetes.io/proxy-connect-timeout: "60"  # connection setup only; nginx notes this usually cannot exceed 75s
+    nginx.ingress.kubernetes.io/proxy-body-size: "1g"
+spec:
+  ingressClassName: nginx
+  rules:
+    - host: litellm.eco-preprod.tsi-dev.otc-service.com
+      http:
+        paths:
+          - path: /
+            pathType: Prefix
+            backend:
+              service:
+                name: litellm-service
+                port:
+                  number: 4000
+  tls:
+    - hosts:
+        - litellm.eco-preprod.tsi-dev.otc-service.com
+      secretName: litellm-tls  # TLS secret named here; the cluster-issuer annotation above requests the certificate
@@ -0,0 +1,8 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization  # preprod overlay: the LiteLLM base plus the preprod Ingress
+
+namespace: litellm  # applied to every resource listed below
+
+resources:
+  - ../../base
+  - ingress.yaml
@@ -0,0 +1,15 @@
+apiVersion: v1
+kind: Service  # governing headless Service for the llamacpp-server StatefulSet
+metadata:
+  name: llamacpp-headless
+  labels:
+    app: llamacpp
+spec:
+  clusterIP: None  # headless: DNS resolves straight to pod IPs, so no port translation takes place
+  selector:
+    app: llamacpp
+  ports:
+    - name: http
+      port: 8080  # must equal the container port; clients connect to the pods directly
+      targetPort: 8080  # ignored for headless Services; kept equal to port as the API reference advises
+      protocol: TCP
@@ -0,0 +1,7 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization  # base for llama.cpp: headless Service, StatefulSet and model PVC
+
+resources:
+  - headless-service.yaml
+  - statefulset.yaml
+  - models-pvc.yaml
@@ -0,0 +1,12 @@
+apiVersion: v1
+kind: PersistentVolumeClaim  # model storage mounted at /models by the llamacpp-server pods
+metadata:
+  name: llamacpp-models
+spec:
+  storageClassName: csi-sfs
+  volumeName: pv-sfs-llamacpp  # binds the claim to this specific PersistentVolume by name
+  accessModes:
+    - ReadWriteMany  # the same claim is referenced by every StatefulSet replica
+  resources:
+    requests:
+      storage: 50Gi
@@ -0,0 +1,124 @@
+apiVersion: apps/v1
+kind: StatefulSet  # llama.cpp server replicas, one GPU each, sharing a single model volume
+metadata:
+  name: llamacpp-server
+  labels:
+    app: llamacpp
+spec:
+  serviceName: llamacpp-headless  # governing headless Service (clusterIP: None)
+  replicas: 5
+  podManagementPolicy: Parallel  # pods are created and deleted in parallel instead of one by one
+  selector:
+    matchLabels:
+      app: llamacpp
+  template:
+    metadata:
+      labels:
+        app: llamacpp
+    spec:
+      imagePullSecrets:
+        - name: default-secret
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:  # hard rule: only nodes labelled accelerator=nvidia-l4
+            nodeSelectorTerms:
+              - matchExpressions:
+                  - key: accelerator
+                    operator: In
+                    values:
+                      - nvidia-l4
+        podAntiAffinity:
+          preferredDuringSchedulingIgnoredDuringExecution:  # soft rule: prefer one llamacpp pod per hostname
+            - weight: 100
+              podAffinityTerm:
+                labelSelector:
+                  matchLabels:
+                    app: llamacpp
+                topologyKey: kubernetes.io/hostname
+      containers:
+        - name: llamacpp
+          image: swr.eu-de.otc.t-systems.com/opentelekomcloud/llama.cpp:server-cuda-b9487
+          env:
+            - name: LLAMA_API_KEY
+              value: "<path:secret/data/llamacpp/auth#api-key>"  # NOTE(review): vault placeholder in a plain env value; consider a Secret + secretKeyRef
+          args:
+            - --host
+            - "0.0.0.0"
+            - --port
+            - "8080"  # same value as containerPort below
+            - -m
+            - "/models/Qwen3.6-35B-A3B-UD-Q3_K_XL-MTP.gguf"  # file expected on the /models volume
+            - --mmproj
+            - "/models/mmproj-BF16-qwen3.6-35b-MTP.gguf"  # file expected on the /models volume
+            - --spec-type
+            - "draft-mtp"
+            - --spec-draft-n-max
+            - "2"
+            - -c
+            - "262144"  # presumably the context size in tokens — confirm against the llama.cpp server docs
+            - -ngl
+            - "999"
+            - --cache-type-k
+            - q8_0
+            - --cache-type-v
+            - q8_0
+            - --temp
+            - "0.6"
+            - --top-p
+            - "0.95"
+            - --top-k
+            - "20"
+            - --min-p
+            - "0.00"
+            - --jinja
+            - --metrics
+            - --no-mmap
+            - --cont-batching
+            - --flash-attn
+            - "on"  # quoted so YAML 1.1 parsers do not read it as a boolean
+            - --parallel
+            - "2"
+            - --alias
+            - qwen3.6-35b
+            - --threads
+            - "4"
+            - --threads-batch
+            - "4"
+            - --ubatch-size
+            - "1024"
+            - --batch-size
+            - "2048"
+            - --swa-full
+            - --kv-unified
+            - --cache-idle-slots
+            - -cram
+            - "10000"
+            - --timeout
+            - "1200"
+            - -n
+            - "32000"
+          ports:
+            - name: http
+              containerPort: 8080
+              protocol: TCP
+          resources:
+            requests:
+              nvidia.com/gpu: "1"
+              cpu: "4"
+              memory: "25Gi"
+            limits:
+              nvidia.com/gpu: "1"  # same value as the GPU request above
+              cpu: "6"
+              memory: "31Gi"
+          volumeMounts:
+            - name: models
+              mountPath: /models
+      volumes:
+        - name: models
+          persistentVolumeClaim:
+            claimName: llamacpp-models  # one ReadWriteMany claim shared by all replicas
+      tolerations:
+        - key: "nvidia.com/gpu"  # allows scheduling onto nodes tainted nvidia.com/gpu=true:NoSchedule
+          operator: "Equal"
+          value: "true"
+          effect: "NoSchedule"
@@ -0,0 +1,7 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization  # preprod overlay: the llama.cpp base with no extra resources
+
+namespace: llamacpp  # applied to every resource pulled in from the base
+
+resources:
+  - ../../base