llm-d
diff --git a/‎helm/templates/NOTES.txt‎
Lines changed: 0 additions & 22 deletions b/‎helm/templates/NOTES.txt‎
Lines changed: 0 additions & 22 deletions
diff --git a/‎helm/templates/_helpers.tpl‎
Lines changed: 16 additions & 4 deletions b/‎helm/templates/_helpers.tpl‎
Lines changed: 16 additions & 4 deletions
diff --git a/‎helm/templates/epp-deployment-mk.yaml‎
Lines changed: 91 additions & 0 deletions b/‎helm/templates/epp-deployment-mk.yaml‎
Lines changed: 91 additions & 0 deletions
diff --git a/‎helm/templates/epp-sa.yaml‎
Lines changed: 13 additions & 0 deletions b/‎helm/templates/epp-sa.yaml‎
Lines changed: 13 additions & 0 deletions
diff --git a/‎helm/templates/epp-service.yaml‎
Lines changed: 15 additions & 0 deletions b/‎helm/templates/epp-service.yaml‎
Lines changed: 15 additions & 0 deletions
diff --git a/‎helm/templates/service.yaml‎
Lines changed: 0 additions & 15 deletions b/‎helm/templates/service.yaml‎
Lines changed: 0 additions & 15 deletions
diff --git a/‎helm/templates/tests/test-connection.yaml‎
Lines changed: 0 additions & 15 deletions b/‎helm/templates/tests/test-connection.yaml‎
Lines changed: 0 additions & 15 deletions
diff --git a/‎helm/values-msvc-mk.yaml‎
Lines changed: 152 additions & 0 deletions b/‎helm/values-msvc-mk.yaml‎
Lines changed: 152 additions & 0 deletions
@@ -35,27 +35,39 @@ Common labels
 */}}
 {{- define "llm-d-modelservice.labels" -}}
+{{- /*
+Emits the chart label, the EPP selector labels, the app version (only when
+.Chart.AppVersion is set) and the managed-by label.
+NOTE(review): because the EPP selector labels are embedded here, every resource
+that includes this helper also carries the llm-d.ai/epp label; confirm that is
+intended for resources that are not part of the EPP.
+*/ -}}
 helm.sh/chart: {{ include "llm-d-modelservice.chart" . }}
-{{ include "llm-d-modelservice.selectorLabels" . }}
+{{ include "llm-d-modelservice.eppSelectorLabels" . }}
 {{- if .Chart.AppVersion }}
 app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
 {{- end }}
 app.kubernetes.io/managed-by: {{ .Release.Service }}
 {{- end }}
 
 {{/*
-Selector labels
+EPP selector labels.
+Emits the chart name, the release instance and an `llm-d.ai/epp` label whose
+value is the chart fullname suffixed with `-epp`. The EPP Deployment selector
+and the EPP Service selector both include this helper, so the two must stay in sync.
 */}}
-{{- define "llm-d-modelservice.selectorLabels" -}}
+{{- define "llm-d-modelservice.eppSelectorLabels" -}}
 app.kubernetes.io/name: {{ include "llm-d-modelservice.name" . }}
 app.kubernetes.io/instance: {{ .Release.Name }}
+llm-d.ai/epp: {{ include "llm-d-modelservice.fullname" . }}-epp
 {{- end }}
 
 {{/*
 Create the name of the service account to use
+When serviceAccount.create is true, an explicitly configured serviceAccount.name
+wins; otherwise the name defaults to "<fullname>-sa". When create is false, the
+configured name is used, falling back to "default".
 */}}
 {{- define "llm-d-modelservice.serviceAccountName" -}}
 {{- if .Values.serviceAccount.create }}
-{{- default (include "llm-d-modelservice.fullname" .) .Values.serviceAccount.name }}
+{{- default (printf "%s-sa" (include "llm-d-modelservice.fullname" .)) .Values.serviceAccount.name }}
+{{- else }}
+{{- default "default" .Values.serviceAccount.name }}
+{{- end }}
+{{- end }}
+
+{{/*
+Create the name of the EPP service account to use
+*/}}
+{{- define "llm-d-modelservice.eppServiceAccountName" -}}
+{{- if .Values.serviceAccount.create }}
+{{- (include "llm-d-modelservice.fullname" .) -}}-epp-sa
 {{- else }}
 {{- default "default" .Values.serviceAccount.name }}
 {{- end }}
 
@@ -0,0 +1,91 @@
+{{- /*
+Deployment for the endpoint picker (EPP).
+Each entry of .Values.endpointPicker.containers becomes one container of the
+pod; pod-level settings (imagePullSecrets, securityContext, volumes and
+scheduling constraints) are read from .Values.endpointPicker.
+*/ -}}
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: {{ include "llm-d-modelservice.fullname" . }}-epp
+  labels:
+    {{- include "llm-d-modelservice.labels" . | nindent 4 }}
+spec:
+  {{- if not .Values.endpointPicker.autoscaling.enabled }}
+  {{- /* Prefer replicaCount, fall back to replicas, so either key takes effect. */}}
+  {{- $replicas := .Values.endpointPicker.replicaCount }}
+  {{- if kindIs "invalid" $replicas }}
+  {{- $replicas = .Values.endpointPicker.replicas }}
+  {{- end }}
+  {{- if not (kindIs "invalid" $replicas) }}
+  replicas: {{ $replicas }}
+  {{- end }}
+  {{- end }}
+  selector:
+    matchLabels:
+      {{- include "llm-d-modelservice.eppSelectorLabels" . | nindent 6 }}
+  template:
+    metadata:
+      labels:
+        {{- include "llm-d-modelservice.labels" . | nindent 8 }}
+    spec:
+      {{- with .Values.endpointPicker.imagePullSecrets }}
+      imagePullSecrets:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      serviceAccountName: {{ include "llm-d-modelservice.eppServiceAccountName" . }}
+      {{- with .Values.endpointPicker.podSecurityContext }}
+      securityContext:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      {{- /* The key is emitted once, outside the loop, so several containers do not produce duplicate `containers:` keys. */}}
+      containers:
+      {{- range $.Values.endpointPicker.containers }}
+        - name: {{ .name }}
+          {{- with $.Values.endpointPicker.securityContext }}
+          securityContext:
+            {{- toYaml . | nindent 12 }}
+          {{- end }}
+          image: "{{ .image }}"
+          {{- with .imagePullPolicy }}
+          imagePullPolicy: {{ . }}
+          {{- end }}
+          {{- with .command }}
+          command:
+            {{- toYaml . | nindent 12 }}
+          {{- end }}
+          args:
+            {{- /*
+            Flag names carry a leading dash; without it they are passed as
+            positional arguments. The namespace is quoted so an all-digit
+            namespace stays a string.
+            NOTE(review): POOLNAME is still a placeholder and should be
+            replaced with the real pool name.
+            */}}
+            - -poolName
+            - POOLNAME
+            - -poolNamespace
+            - {{ $.Release.Namespace | quote }}
+          {{- with .args }}
+            {{- toYaml . | nindent 12 }}
+          {{- end }}
+          {{- with .env }}
+          env:
+            {{- toYaml . | nindent 12 }}
+          {{- end }}
+          ports:
+            - name: http2
+              containerPort: {{ $.Values.endpointPicker.service.port }}
+              protocol: TCP
+          {{- with .livenessProbe }}
+          livenessProbe:
+            {{- toYaml . | nindent 12 }}
+          {{- end }}
+          {{- with .readinessProbe }}
+          readinessProbe:
+            {{- toYaml . | nindent 12 }}
+          {{- end }}
+          {{- with .resources }}
+          resources:
+            {{- toYaml . | nindent 12 }}
+          {{- end }}
+          {{- with .volumeMounts }}
+          volumeMounts:
+            {{- toYaml . | nindent 12 }}
+          {{- end }}
+      {{- end }}{{- /* range $.Values.endpointPicker.containers */}}
+      {{- with .Values.endpointPicker.volumes }}
+      volumes:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      {{- with .Values.endpointPicker.nodeSelector }}
+      nodeSelector:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      {{- with .Values.endpointPicker.affinity }}
+      affinity:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      {{- with .Values.endpointPicker.tolerations }}
+      tolerations:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
@@ -0,0 +1,13 @@
+{{- /*
+ServiceAccount for the endpoint picker (EPP) pods.
+NOTE(review): creation is gated on serviceAccount.create, while annotations and
+automount are read from eppServiceAccount; confirm whether
+eppServiceAccount.create was meant to control this resource.
+*/ -}}
+{{- if .Values.serviceAccount.create -}}
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: {{ include "llm-d-modelservice.eppServiceAccountName" . }}
+  labels:
+    {{- include "llm-d-modelservice.labels" . | nindent 4 }}
+  {{- with .Values.eppServiceAccount.annotations }}
+  annotations:
+    {{- toYaml . | nindent 4 }}
+  {{- end }}
+automountServiceAccountToken: {{ .Values.eppServiceAccount.automount }}
+{{- end }}
@@ -0,0 +1,15 @@
+{{- /*
+Service exposing the endpoint picker (EPP) pods.
+Type, port, targetPort and appProtocol all come from
+.Values.endpointPicker.service; the type falls back to ClusterIP when unset.
+The selector uses the same EPP selector labels as the EPP Deployment.
+*/ -}}
+apiVersion: v1
+kind: Service
+metadata:
+  name: {{ include "llm-d-modelservice.fullname" . }}-epp
+  labels:
+    {{- include "llm-d-modelservice.labels" . | nindent 4 }}
+spec:
+  type: {{ .Values.endpointPicker.service.type | default "ClusterIP" }}
+  ports:
+    - port: {{ .Values.endpointPicker.service.port }}
+      targetPort: {{ .Values.endpointPicker.service.targetPort }}
+      protocol: TCP
+      appProtocol: {{ .Values.endpointPicker.service.appProtocol }}
+  selector:
+    {{- include "llm-d-modelservice.eppSelectorLabels" . | nindent 4 }}
@@ -0,0 +1,152 @@
+# TODO
+# decoupleScaling: false
+
+lws: false  # If true, creates LWS instead of deployments
+inferencePool: true
+inferenceModel: true
+httpRoute: true
+
+routing:
+  # This is the model name for the OpenAI request
+  modelName: facebook/opt-125m
+  ports:
+    servicePort: 8000   # Sidecar listens on this port for requests. If there's no sidecar, the request goes here
+    internalPort: 8200  # Sidecar forwards request to vllm container on this port
+
+modelArtifacts:
+  # When specifying the URI with the `hf` prefix, the <repo-id>/<model-id> string
+  # is extracted and exposed as a template variable that can be used as {{ .HFModelName }}
+
+  # uri: hf://facebook/opt-125m
+  type: hf                # oneOf ["hf", "oci", "pvc"]
+  artifact: facebook/opt-125m
+  authSecretName: "hf-secret"
+  size: 5Mi
+
+# Describes the decode pods
+decode:
+  enableService: false
+  replicas: 1
+
+  # for LWS
+  parallelism:
+    tensor: 8
+    data: 16
+    dataLocal: 1
+
+  acceleratorTypes:
+    labelKey: nvidia.com/gpu.product
+    labelValues:
+      # According to the blog, Scout requires H100s
+      - NVIDIA-H100
+  # initContainers:
+  containers:
+    - name: "vllm"
+      image: "vllm-ai/vllm:latest"
+      args:
+        - "HFModelName"
+      env:
+        - name: "VLLM_LOG_LEVEL"
+          value: "DEBUG"  # Set to DEBUG for more detailed logs, or INFO for less verbose logs
+      envFrom:
+        - configMapRef:
+            name: vllm-config
+      resources:
+        requests:
+          cpu: "1"       # Request 1 CPU core
+          memory: "4Gi"  # Request 4 GiB of memory
+        limits:
+          cpu: "2"       # Limit to 2 CPU cores
+          memory: "8Gi"  # Limit to 8 GiB of memory
+      mountModelVolume: true
+
+# Describes the prefill pods (same structure as the decode section above)
+prefill:
+  replicas: 1
+  containers:
+    - name: "vllm"
+      args:
+        - "HFModelName"
+    
+endpointPicker:
+  # Service settings. More information: https://kubernetes.io/docs/concepts/services-networking/service/
+  service:
+    # Service type. More information: https://kubernetes.io/docs/concepts/services-networking/service/#publishing-services-service-types
+    type: ClusterIP
+    # Service ports. More information: https://kubernetes.io/docs/concepts/services-networking/service/#field-spec-ports
+    port: 9002
+    targetPort: 9002
+    appProtocol: http2
+
+  # enableService: true
+
+  autoscaling:
+    enabled: false
+  replicas: 1
+
+  containers:
+    - name: "epp"
+      image: "ghcr.io/llm-d/llm-d-inference-scheduler:0.0.3"
+      # command:
+      args:
+        # - -poolName
+        # - InferencePoolName
+        # - -poolNamespace
+        # - llmd-kalantar
+        - -v
+        - "5"
+        - --zap-encoder
+        - json
+        - -grpcPort
+        - "9002"
+        - -grpcHealthPort
+        - "9003"
+      env:
+        - name: PD_ENABLED
+          value: "true"
+        - name: PD_PROMPT_LEN_THRESHOLD
+          value: "10"
+      ports:
+        - containerPort: 9002
+          protocol: TCP
+        - containerPort: 9003
+          protocol: TCP
+        - containerPort: 9090
+          name: metrics
+          protocol: TCP
+      livenessProbe:
+        failureThreshold: 3
+        grpc:
+          port: 9003
+          service: envoy.service.ext_proc.v3.ExternalProcessor
+        initialDelaySeconds: 5
+        periodSeconds: 10
+      readinessProbe:
+        failureThreshold: 3
+        grpc:
+          port: 9003
+          service: envoy.service.ext_proc.v3.ExternalProcessor
+        initialDelaySeconds: 5
+        periodSeconds: 10
+
+
+
+
+# This section builds out the service account. More information: https://kubernetes.io/docs/concepts/security/service-accounts/
+serviceAccount:
+  # Specifies whether a service account should be created
+  create: true
+  # Automatically mount a ServiceAccount's API credentials?
+  automount: true
+  # Annotations to add to the service account
+  annotations: {}
+
+# This section builds out the endpoint picker (EPP) service account. More information: https://kubernetes.io/docs/concepts/security/service-accounts/
+eppServiceAccount:
+  # Whether to create the EPP service account. NOTE(review): the EPP ServiceAccount template in this change checks serviceAccount.create instead; confirm this key is read.
+  create: true
+  # Automatically mount the EPP ServiceAccount's API credentials?
+  automount: true
+  # Annotations to add to the EPP service account
+  annotations: {}
+