Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
55 commits
Select commit Hold shift + click to select a range
0041d31
AI deployment init
SebastianGode May 5, 2026
5a23f09
fix applications
SebastianGode May 5, 2026
c17cac5
SFS
SebastianGode May 5, 2026
2add91e
SFS
SebastianGode May 5, 2026
bfa11e9
different label
SebastianGode May 5, 2026
9885930
control plane url
SebastianGode May 6, 2026
1f34bd7
Update litellm
SebastianGode May 6, 2026
3cccf9a
2 replicas
SebastianGode May 6, 2026
8935f23
Satefulset for 2 replicas
SebastianGode May 6, 2026
15e4f0c
loadbalancing
SebastianGode May 6, 2026
5249374
Ip whitelisting
SebastianGode May 6, 2026
5bd865f
Ip whitelisting revert
SebastianGode May 6, 2026
c692f2c
Ip whitelisting
SebastianGode May 6, 2026
d6f2b2d
Ip whitelisting
SebastianGode May 6, 2026
64a9720
Ip whitelisting
SebastianGode May 6, 2026
873156f
other loadbalancing
SebastianGode May 6, 2026
a9829d3
Fix ip
SebastianGode May 6, 2026
1c83004
performance
SebastianGode May 6, 2026
9ea37d8
performance
SebastianGode May 8, 2026
1c217ca
more performance
SebastianGode May 8, 2026
d216e18
Images
SebastianGode May 8, 2026
9e4f4a0
Loadbalancing changes
SebastianGode May 12, 2026
f6441c7
more flags
SebastianGode May 13, 2026
2cf4e24
Enable CRAM for faster prompt processing
SebastianGode May 13, 2026
1639923
fix spelling
SebastianGode May 13, 2026
4e2d537
more cache
SebastianGode May 13, 2026
ad1119f
newer llamacpp
SebastianGode May 13, 2026
3a7d2ff
newer llamacpp
SebastianGode May 13, 2026
778ad6d
more replicas
SebastianGode May 18, 2026
6749655
fix llamacpp timeout on long context
SebastianGode May 19, 2026
f373175
Litellm routing adjustment
SebastianGode May 19, 2026
8f9a48e
Fix llama context splitting
SebastianGode May 19, 2026
450a5fb
parallel 2
SebastianGode May 22, 2026
5ad5085
replicas 3
SebastianGode May 22, 2026
e1afe19
Whitelist testip
SebastianGode May 27, 2026
bb19e45
nginx timeout
SebastianGode May 27, 2026
7679fec
Fix 413 error
SebastianGode May 29, 2026
9cc3bfb
Update llamacpp
SebastianGode Jun 1, 2026
75638ee
fix version
SebastianGode Jun 1, 2026
cd426c5
Add max token output to 32000
SebastianGode Jun 1, 2026
a4a1fcb
new toleration policy
SebastianGode Jun 2, 2026
cb60c74
enable metrics
SebastianGode Jun 2, 2026
30df684
disable auth temporarily
SebastianGode Jun 2, 2026
0833fad
update litellm
SebastianGode Jun 2, 2026
78bdcf1
Update llamacpp image
SebastianGode Jun 3, 2026
513bbec
secret pull
SebastianGode Jun 3, 2026
b26db26
secret pull
SebastianGode Jun 3, 2026
c0627bb
litellm from swr
SebastianGode Jun 3, 2026
b79f878
Enable MTP
SebastianGode Jun 3, 2026
8b580e5
Disable mmproj
SebastianGode Jun 3, 2026
10a3660
Enable mmproj
SebastianGode Jun 3, 2026
30029b5
more RAM
SebastianGode Jun 8, 2026
130bf7a
more nodes
SebastianGode Jun 9, 2026
df835f1
remove deployment affinity
SebastianGode Jun 10, 2026
bd42003
Added deployment affinity
SebastianGode Jun 10, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -486,6 +486,40 @@ applications:
syncOptions:
- CreateNamespace=true

# Argo CD application entry for the LiteLLM proxy; targeted at the preprod cluster only.
- name: litellm
clusters: [preprod]
config:
namespace: litellm
repoURL: 'https://github.com/opentelekomcloud-infra/system-config.git'
# NOTE(review): this tracks a branch named 'litellm' — presumably the feature branch of this
# change; confirm it is switched to the long-lived branch before merge.
targetRevision: 'litellm'
# Kustomize overlay directory rendered for this application.
path: kubernetes/kustomize/litellm/overlays/preprod/
# Presumably needed so the <path:...> placeholders in the LiteLLM manifests are resolved
# at render time — verify against the plugin configuration.
pluginName: argocd-vault-plugin-kustomize
pluginEnv: '.'
project: infra
syncPolicy:
automated:
# Automated sync with pruning and self-heal enabled.
prune: true
selfHeal: true
syncOptions:
- CreateNamespace=true

# Argo CD application entry for the llama.cpp inference servers; targeted at the preprod cluster only.
- name: llamacpp
clusters: [preprod]
config:
namespace: llamacpp
repoURL: 'https://github.com/opentelekomcloud-infra/system-config.git'
# NOTE(review): this tracks a branch named 'litellm' — presumably the feature branch of this
# change; confirm it is switched to the long-lived branch before merge.
targetRevision: 'litellm'
# Kustomize overlay directory rendered for this application.
path: kubernetes/kustomize/llamacpp/overlays/preprod/
# Presumably needed so the <path:...> placeholder in the llama.cpp StatefulSet is resolved
# at render time — verify against the plugin configuration.
pluginName: argocd-vault-plugin-kustomize
pluginEnv: '.'
project: infra
syncPolicy:
automated:
# Automated sync with pruning and self-heal enabled.
prune: true
selfHeal: true
syncOptions:
- CreateNamespace=true

- name: mcaptcha
clusters: [preprod]
config:
Expand Down
27 changes: 27 additions & 0 deletions kubernetes/kustomize/litellm/base/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
---
# ConfigMap holding the LiteLLM proxy configuration under the key `config.yaml`.
# Comments are kept outside the block scalar so the embedded file content is
# not altered.
#
# Timeouts and retries are set both globally (litellm_settings) and for the
# router (router_settings); keep the two in step when tuning them.
apiVersion: v1
kind: ConfigMap
metadata:
  name: litellm-config
data:
  config.yaml: |
    general_settings:
      control_plane_url: "https://litellm.eco-preprod.tsi-dev.otc-service.com"

    litellm_settings:
      num_retries: 4
      request_timeout: 6000
      callbacks: ["prometheus"]
      require_auth_for_metrics_endpoint: true

    router_settings:
      routing_strategy: least-busy
      num_retries: 4
      timeout: 6000

      optional_pre_call_checks:
        - deployment_affinity

      deployment_affinity_ttl_seconds: 60

      allowed_fails: 4
      cooldown_time: 30
50 changes: 50 additions & 0 deletions kubernetes/kustomize/litellm/base/deployment.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
---
# LiteLLM proxy Deployment.
#
# Runs a single replica of the proxy on port 4000 and reads its configuration
# from the `litellm-config` ConfigMap, mounted as /app/config.yaml.
#
# Health probes gate traffic and restarts:
#   - startupProbe   gives the proxy up to 5 minutes to come up before the
#                    liveness probe takes over.
#   - readinessProbe keeps the pod out of the Service endpoints until the
#                    proxy reports ready.
#   - livenessProbe  restarts a proxy that stops answering.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: litellm-proxy
  labels:
    app: litellm
spec:
  replicas: 1
  selector:
    matchLabels:
      app: litellm
  template:
    metadata:
      labels:
        app: litellm
    spec:
      containers:
        - name: litellm
          image: swr.eu-de.otc.t-systems.com/opentelekomcloud/litellm:v1.87.0
          args:
            - "--config"
            - "/app/config.yaml"
            - "--port"
            - "4000"
          env:
            # NOTE(review): the <path:...> placeholders are presumably
            # substituted by the Argo CD vault plugin at render time, which
            # would leave the resolved secrets in plain text in this
            # Deployment's spec. Consider a Secret plus secretKeyRef —
            # confirm with the platform team.
            - name: LITELLM_MASTER_KEY
              value: "<path:secret/data/litellm/api#master-key>"
            - name: DATABASE_URL
              value: "<path:secret/data/litellm/postgres#database-url>"
            # Quoted on purpose: environment variable values must be strings.
            - name: STORE_MODEL_IN_DB
              value: "True"
          ports:
            - name: http
              containerPort: 4000
              protocol: TCP
          # 60 attempts x 5 s = up to 5 minutes for the proxy to start.
          startupProbe:
            httpGet:
              path: /health/liveliness
              port: http
            periodSeconds: 5
            timeoutSeconds: 5
            failureThreshold: 60
          readinessProbe:
            httpGet:
              path: /health/readiness
              port: http
            periodSeconds: 10
            timeoutSeconds: 5
            failureThreshold: 3
          livenessProbe:
            httpGet:
              path: /health/liveliness
              port: http
            periodSeconds: 15
            timeoutSeconds: 5
            failureThreshold: 4
          resources:
            requests:
              cpu: "500m"
              memory: "512Mi"
            limits:
              cpu: "2"
              memory: "2Gi"
          volumeMounts:
            # subPath mounts only the single file, leaving the rest of /app
            # from the image intact.
            - name: config
              mountPath: /app/config.yaml
              subPath: config.yaml
      volumes:
        - name: config
          configMap:
            name: litellm-config
7 changes: 7 additions & 0 deletions kubernetes/kustomize/litellm/base/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
---
# Base Kustomization for LiteLLM: proxy configuration, Deployment and Service.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

resources:
  - config.yaml
  - deployment.yaml
  - service.yaml
15 changes: 15 additions & 0 deletions kubernetes/kustomize/litellm/base/service.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
---
# Cluster-internal Service in front of the LiteLLM proxy pods (label
# app=litellm); exposes port 4000 and forwards to container port 4000.
apiVersion: v1
kind: Service
metadata:
  name: litellm-service
  labels:
    app: litellm
spec:
  type: ClusterIP
  selector:
    app: litellm
  ports:
    - name: http
      protocol: TCP
      port: 4000
      targetPort: 4000
29 changes: 29 additions & 0 deletions kubernetes/kustomize/litellm/overlays/preprod/ingress.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
---
# Public entry point for the LiteLLM proxy in preprod.
#
# TLS is issued by cert-manager (cluster issuer `letsencrypt-prod`) into the
# `litellm-tls` secret; all paths of the host are routed to
# `litellm-service` on port 4000.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: litellm
  annotations:
    cert-manager.io/cluster-issuer: letsencrypt-prod
    nginx.ingress.kubernetes.io/backend-protocol: "HTTP"
    # Only these source ranges may reach the proxy.
    # NOTE(review): the trailing /32 looks like a single test address —
    # confirm it is still needed.
    nginx.ingress.kubernetes.io/whitelist-source-range: "80.158.0.0/16,164.30.0.0/16,62.156.0.0/14,79.152.111.178/32"
    # Long read/send timeouts so slow completions are not cut off mid-response.
    nginx.ingress.kubernetes.io/proxy-read-timeout: "3600"
    nginx.ingress.kubernetes.io/proxy-send-timeout: "3600"
    # Connection establishment only. nginx documents that this timeout cannot
    # usually exceed 75 seconds, so the previous value of 3600 was not
    # achievable; a short value also lets an unreachable backend fail fast.
    nginx.ingress.kubernetes.io/proxy-connect-timeout: "60"
    # Raised request body limit (large uploads otherwise fail with HTTP 413).
    nginx.ingress.kubernetes.io/proxy-body-size: "1g"
spec:
  ingressClassName: nginx
  rules:
    - host: litellm.eco-preprod.tsi-dev.otc-service.com
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: litellm-service
                port:
                  number: 4000
  tls:
    - hosts:
        - litellm.eco-preprod.tsi-dev.otc-service.com
      secretName: litellm-tls
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
---
# Preprod overlay for LiteLLM: places the base resources in the `litellm`
# namespace and adds the preprod Ingress.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

namespace: litellm

resources:
  - ../../base
  - ingress.yaml
15 changes: 15 additions & 0 deletions kubernetes/kustomize/llamacpp/base/headless-service.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
---
# Headless Service for the llama.cpp StatefulSet (pods labelled app=llamacpp).
#
# `clusterIP: None` means there is no virtual IP and no proxying: DNS resolves
# straight to the pod IPs, so clients always connect to the port the container
# listens on. The Service port is therefore set to the container port (8080);
# a different number here would advertise a port nothing listens on.
apiVersion: v1
kind: Service
metadata:
  name: llamacpp-headless
  labels:
    app: llamacpp
spec:
  clusterIP: None
  selector:
    app: llamacpp
  ports:
    - name: http
      port: 8080
      targetPort: 8080
      protocol: TCP
7 changes: 7 additions & 0 deletions kubernetes/kustomize/llamacpp/base/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
---
# Base Kustomization for llama.cpp: headless Service, StatefulSet and the
# model storage claim.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

resources:
  - headless-service.yaml
  - statefulset.yaml
  - models-pvc.yaml
12 changes: 12 additions & 0 deletions kubernetes/kustomize/llamacpp/base/models-pvc.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
---
# Claim for the shared model storage mounted by the llama.cpp pods.
# ReadWriteMany lets several pods mount the same volume; `volumeName` binds
# the claim to the named, already existing PersistentVolume.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: llamacpp-models
spec:
  storageClassName: csi-sfs
  volumeName: pv-sfs-llamacpp
  accessModes:
    - ReadWriteMany
  resources:
    requests:
      storage: 50Gi
124 changes: 124 additions & 0 deletions kubernetes/kustomize/llamacpp/base/statefulset.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
---
# llama.cpp inference servers.
#
# Five replicas, started in parallel, each requesting one GPU. Pods are
# required to land on nodes labelled accelerator=nvidia-l4 and prefer to be
# spread one per host. All replicas mount the shared `llamacpp-models` claim
# at /models.
#
# Health probes on GET /health gate traffic and restarts:
#   - startupProbe   allows up to 15 minutes for the model to load.
#   - readinessProbe keeps a pod out of the headless Service's DNS records
#                    until the server reports healthy.
#   - livenessProbe  restarts a server that stops answering; thresholds are
#                    deliberately lenient so a busy server is not killed.
# NOTE(review): llama.cpp documents /health as reachable without the API key
# and as returning an error status while the model loads — verify against the
# pinned image build.
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: llamacpp-server
  labels:
    app: llamacpp
spec:
  serviceName: llamacpp-headless
  replicas: 5
  podManagementPolicy: Parallel
  selector:
    matchLabels:
      app: llamacpp
  template:
    metadata:
      labels:
        app: llamacpp
    spec:
      imagePullSecrets:
        - name: default-secret
      affinity:
        # Hard requirement: GPU nodes only.
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: accelerator
                    operator: In
                    values:
                      - nvidia-l4
        # Soft preference: avoid co-locating two replicas on one host.
        podAntiAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 100
              podAffinityTerm:
                labelSelector:
                  matchLabels:
                    app: llamacpp
                topologyKey: kubernetes.io/hostname
      containers:
        - name: llamacpp
          image: swr.eu-de.otc.t-systems.com/opentelekomcloud/llama.cpp:server-cuda-b9487
          env:
            # NOTE(review): the <path:...> placeholder is presumably
            # substituted at render time, which would leave the resolved key
            # in plain text in this spec — consider a Secret reference.
            - name: LLAMA_API_KEY
              value: "<path:secret/data/llamacpp/auth#api-key>"
          args:
            # Listen address and port (must match containerPort below).
            - --host
            - "0.0.0.0"
            - --port
            - "8080"
            # Model and multimodal projector files on the shared volume.
            - -m
            - "/models/Qwen3.6-35B-A3B-UD-Q3_K_XL-MTP.gguf"
            - --mmproj
            - "/models/mmproj-BF16-qwen3.6-35b-MTP.gguf"
            - --spec-type
            - "draft-mtp"
            - --spec-draft-n-max
            - "2"
            - -c
            - "262144"
            - -ngl
            - "999"
            - --cache-type-k
            - q8_0
            - --cache-type-v
            - q8_0
            # Default sampling parameters.
            - --temp
            - "0.6"
            - --top-p
            - "0.95"
            - --top-k
            - "20"
            - --min-p
            - "0.00"
            - --jinja
            - --metrics
            - --no-mmap
            - --cont-batching
            # Quoted so the value stays the string "on" rather than a boolean.
            - --flash-attn
            - "on"
            - --parallel
            - "2"
            - --alias
            - qwen3.6-35b
            - --threads
            - "4"
            - --threads-batch
            - "4"
            - --ubatch-size
            - "1024"
            - --batch-size
            - "2048"
            - --swa-full
            - --kv-unified
            - --cache-idle-slots
            - -cram
            - "10000"
            - --timeout
            - "1200"
            - -n
            - "32000"
          ports:
            - name: http
              containerPort: 8080
              protocol: TCP
          # 90 attempts x 10 s = up to 15 minutes for the model to load.
          startupProbe:
            httpGet:
              path: /health
              port: http
            periodSeconds: 10
            timeoutSeconds: 5
            failureThreshold: 90
          readinessProbe:
            httpGet:
              path: /health
              port: http
            periodSeconds: 10
            timeoutSeconds: 5
            failureThreshold: 3
          livenessProbe:
            httpGet:
              path: /health
              port: http
            periodSeconds: 30
            timeoutSeconds: 10
            failureThreshold: 5
          resources:
            requests:
              nvidia.com/gpu: "1"
              cpu: "4"
              memory: "25Gi"
            limits:
              nvidia.com/gpu: "1"
              cpu: "6"
              memory: "31Gi"
          volumeMounts:
            - name: models
              mountPath: /models
      volumes:
        - name: models
          persistentVolumeClaim:
            claimName: llamacpp-models
      # Allow scheduling onto nodes tainted nvidia.com/gpu=true:NoSchedule.
      tolerations:
        - key: "nvidia.com/gpu"
          operator: "Equal"
          value: "true"
          effect: "NoSchedule"
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
---
# Preprod overlay for llama.cpp: places the base resources in the `llamacpp`
# namespace without further changes.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

namespace: llamacpp

resources:
  - ../../base
Loading