# Source: config/samples/nim/kserve/serverless/nimservice.yaml in NVIDIA/k8s-nim-operator at commit cf8a64a1ce8a3178c2d18ef18f436732d6d7cec7
---
# Sample NIMService: runs a single replica of the Llama 3.2 1B Instruct NIM
# image and exposes it inside the cluster (ClusterIP) on port 8000.
apiVersion: apps.nvidia.com/v1alpha1
kind: NIMService
metadata:
  name: meta-llama-3-2-1b-instruct
  namespace: nim-service
spec:
  replicas: 1

  # Container image. The tag is quoted so it is read as the string 1.8 and
  # not as a float.
  image:
    repository: nvcr.io/nim/meta/llama-3.2-1b-instruct
    tag: '1.8'
    pullPolicy: IfNotPresent
    # Secrets are referenced by name only; presumably they must already exist
    # in the nim-service namespace -- TODO confirm.
    pullSecrets:
      - ngc-secret
  authSecret: ngc-api-secret

  # Model storage. NOTE(review): `name` presumably refers to a NIMCache
  # resource of the same name, and `profile` to one of its cached model
  # profiles -- verify both against the cluster before applying.
  storage:
    nimCache:
      name: meta-llama-3-2-1b-instruct
      profile: '4f904d571fe60ff24695b5ee2aa42da58cb460787a968f1e8a09f5a7e862728d'

  # Requests and limits are deliberately identical: 1 GPU, 12 CPUs, 32Gi.
  # CPU counts are quoted so they stay strings.
  resources:
    requests:
      nvidia.com/gpu: 1
      cpu: '12'
      memory: 32Gi
    limits:
      nvidia.com/gpu: 1
      cpu: '12'
      memory: 32Gi

  # Scheduling: pin to p4d.24xlarge nodes and tolerate their `p4-gpu`
  # NoSchedule taint. Both values are environment-specific; adjust them for
  # clusters that use other instance types or taint keys.
  nodeSelector:
    node.kubernetes.io/instance-type: p4d.24xlarge
  tolerations:
    - key: p4-gpu
      operator: Exists
      effect: NoSchedule

  expose:
    service:
      type: ClusterIP
      port: 8000

  # All three probes issue the same request (GET /v1/models on port 8000, the
  # port exposed above) with identical timings.
  # NOTE(review): timeoutSeconds (300) is larger than periodSeconds (10).
  # Kubernetes accepts this, but confirm that it is intentional.
  startupProbe:
    enabled: true
    probe:
      httpGet:
        path: /v1/models
        port: 8000
      initialDelaySeconds: 120
      periodSeconds: 10
      timeoutSeconds: 300
  readinessProbe:
    enabled: true
    probe:
      httpGet:
        path: /v1/models
        port: 8000
      initialDelaySeconds: 120
      periodSeconds: 10
      timeoutSeconds: 300
  livenessProbe:
    enabled: true
    probe:
      httpGet:
        path: /v1/models
        port: 8000
      initialDelaySeconds: 120
      periodSeconds: 10
      timeoutSeconds: 300