Project-HAMi
diff --git a/‎charts/hami/templates/_helpers.tpl‎
Lines changed: 11 additions & 0 deletions b/‎charts/hami/templates/_helpers.tpl‎
Lines changed: 11 additions & 0 deletions
diff --git a/‎charts/hami/templates/device-plugin/daemonsetmock.yaml‎
Lines changed: 55 additions & 0 deletions b/‎charts/hami/templates/device-plugin/daemonsetmock.yaml‎
Lines changed: 55 additions & 0 deletions
diff --git a/‎charts/hami/templates/scheduler/clusterrole.yaml‎
Lines changed: 11 additions & 1 deletion b/‎charts/hami/templates/scheduler/clusterrole.yaml‎
Lines changed: 11 additions & 1 deletion
diff --git a/‎charts/hami/templates/scheduler/device-configmap.yaml‎
Lines changed: 8 additions & 0 deletions b/‎charts/hami/templates/scheduler/device-configmap.yaml‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎charts/hami/templates/scheduler/rolebinding.yaml‎
Lines changed: 15 additions & 0 deletions b/‎charts/hami/templates/scheduler/rolebinding.yaml‎
Lines changed: 15 additions & 0 deletions
diff --git a/‎charts/hami/templates/scheduler/serviceaccount.yaml‎
Lines changed: 8 additions & 0 deletions b/‎charts/hami/templates/scheduler/serviceaccount.yaml‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎charts/hami/values.yaml‎
Lines changed: 19 additions & 0 deletions b/‎charts/hami/values.yaml‎
Lines changed: 19 additions & 0 deletions
diff --git a/‎docs/config.md‎
Lines changed: 2 additions & 0 deletions b/‎docs/config.md‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎docs/config_cn.md‎
Lines changed: 2 additions & 0 deletions b/‎docs/config_cn.md‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎pkg/device/ascend/device.go‎
Lines changed: 5 additions & 0 deletions b/‎pkg/device/ascend/device.go‎
Lines changed: 5 additions & 0 deletions
@@ -48,6 +48,13 @@ The app name for DevicePlugin
 {{- printf "%s-device-plugin" ( include "hami-vgpu.fullname" . ) | trunc 63 | trimSuffix "-" -}}
 {{- end -}}
 
+{{/*
+The app name for MockDevicePlugin: "<fullname>-mock-device-plugin", truncated to 63 characters with any trailing "-" removed.
+*/}}
+{{- define "hami-vgpu.mock-device-plugin" -}}
+{{- printf "%s-mock-device-plugin" ( include "hami-vgpu.fullname" . ) | trunc 63 | trimSuffix "-" -}}
+{{- end -}}
+
 {{/*
 The tls secret name for Scheduler
 */}}
@@ -123,6 +130,10 @@ app.kubernetes.io/instance: {{ .Release.Name }}
 {{ include "common.images.image" (dict "imageRoot" .Values.devicePlugin.image "global" .Values.global "tag" .Values.global.imageTag) }}
 {{- end -}}
 
+{{/*
+The image reference for MockDevicePlugin.
+The "tag" argument is read from .Values.mockDevicePlugin.image.tag, which is
+the key declared in values.yaml (there is no .Values.mockDevicePlugin.tag).
+*/}}
+{{- define "hami.mockDevicePlugin.image" -}}
+{{ include "common.images.image" (dict "imageRoot" .Values.mockDevicePlugin.image "global" .Values.global "tag" .Values.mockDevicePlugin.image.tag) }}
+{{- end -}}
+
 {{- define "hami.devicePlugin.monitor.image" -}}
 {{ include "common.images.image" (dict "imageRoot" .Values.devicePlugin.monitor.image "global" .Values.global "tag" .Values.global.imageTag) }}
 {{- end -}}
 
@@ -0,0 +1,55 @@
+{{/*
+DaemonSet that runs the mock device plugin; rendered only when
+.Values.mockDevicePlugin.enabled is true.
+The container mounts the host's /var/lib/kubelet/device-plugins and /sys
+directories, and reads device-config.yaml from the "<scheduler>-device" ConfigMap.
+Image pull secrets listed in .Values.mockDevicePlugin.image.pullSecrets (a list
+of secret names) are attached to the pod; nothing is rendered when the list is empty.
+NOTE(review): the scheduler.alpha.kubernetes.io/critical-pod annotation appears to
+be deprecated upstream in favor of priorityClassName - confirm whether it is still needed.
+*/}}
+{{- if .Values.mockDevicePlugin.enabled }}
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: {{ include "hami-vgpu.mock-device-plugin" . }}
+  namespace: {{ include "hami-vgpu.namespace" . }}
+spec:
+  selector:
+    matchLabels:
+      app.kubernetes.io/component: hami-mock-device-plugin
+      {{- include "hami-vgpu.selectorLabels" . | nindent 6 }}
+  template:
+    metadata:
+      annotations:
+        scheduler.alpha.kubernetes.io/critical-pod: ""
+      labels:
+        app.kubernetes.io/component: hami-mock-device-plugin
+        {{- include "hami-vgpu.selectorLabels" . | nindent 8 }}
+    spec:
+      serviceAccountName: {{ include "hami-vgpu.mock-device-plugin" . }}
+      {{- with .Values.mockDevicePlugin.image.pullSecrets }}
+      imagePullSecrets:
+        {{- range . }}
+        - name: {{ . | quote }}
+        {{- end }}
+      {{- end }}
+      tolerations:
+      - key: CriticalAddonsOnly
+        operator: Exists
+      containers:
+      - image: {{ include "hami.mockDevicePlugin.image" . }}
+        imagePullPolicy: {{ .Values.mockDevicePlugin.image.pullPolicy }}
+        name: hami-mock-dp-cntr
+        env:
+          - name: NODE_NAME
+            valueFrom:
+              fieldRef:
+                fieldPath: spec.nodeName
+        command:
+          - ./k8s-device-plugin
+          - -v=5
+          - --device-config-file=/device-config.yaml
+        volumeMounts:
+          - name: dp
+            mountPath: /var/lib/kubelet/device-plugins
+          - name: sys
+            mountPath: /sys
+          - name: device-config
+            mountPath: /device-config.yaml
+            subPath: device-config.yaml
+      volumes:
+        - name: dp
+          hostPath:
+            path: /var/lib/kubelet/device-plugins
+        - name: sys
+          hostPath:
+            path: /sys
+        - name: device-config
+          configMap:
+            name: {{ include "hami-vgpu.scheduler" . }}-device
+{{- end -}}
@@ -21,4 +21,14 @@ rules:
   - apiGroups: [""]
     resources: ["resourcequotas"]
     verbs: ["get", "list", "watch"]
-
+---
+{{- if .Values.mockDevicePlugin.enabled }}
+{{- /* ClusterRole for the mock device plugin: get/update/list/patch on core "nodes". Rendered only when mockDevicePlugin.enabled is true. */}}
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: {{ include "hami-vgpu.mock-device-plugin" . }}
+rules:
+  - apiGroups: [""]
+    resources: ["nodes"]
+    verbs: ["get", "update", "list", "patch"]
+{{- end -}}
@@ -21,6 +21,7 @@ data:
       defaultMemory: 0
       defaultCores: 0
       defaultGPUNum: 1
+      memoryFactor: 1
       deviceSplitCount: {{ .Values.devicePlugin.deviceSplitCount }}
       deviceMemoryScaling: {{ .Values.devicePlugin.deviceMemoryScaling }}
       deviceCoreScaling: {{ .Values.devicePlugin.deviceCoreScaling }}
@@ -235,6 +236,7 @@ data:
       resourceCountName: {{ .Values.dcuResourceName }}
       resourceMemoryName: {{ .Values.dcuResourceMem }}
       resourceCoreName: {{ .Values.dcuResourceCores }}
+      memoryFactor: 1
     metax:
       resourceCountName: "metax-tech.com/gpu"
       resourceVCountName: {{ .Values.metaxResourceName }}
@@ -286,6 +288,7 @@ data:
       resourceMemoryName: huawei.com/Ascend910A-memory
       memoryAllocatable: 32768
       memoryCapacity: 32768
+      memoryFactor: 1
       aiCore: 30
       templates:
         - name: vir02
@@ -306,6 +309,7 @@ data:
       resourceMemoryName: huawei.com/Ascend910B2-memory
       memoryAllocatable: 65536
       memoryCapacity: 65536
+      memoryFactor: 1
       aiCore: 24
       aiCPU: 6
       templates:
@@ -327,6 +331,7 @@ data:
       resourceMemoryName: huawei.com/Ascend910B3-memory
       memoryAllocatable: 65536
       memoryCapacity: 65536
+      memoryFactor: 1
       aiCore: 20
       aiCPU: 7
       templates:
@@ -344,6 +349,7 @@ data:
       resourceMemoryName: huawei.com/Ascend910B4-1-memory
       memoryAllocatable: 65536
       memoryCapacity: 65536
+      memoryFactor: 1
       aiCore: 20
       aiCPU: 7
       templates:
@@ -365,6 +371,7 @@ data:
       resourceMemoryName: huawei.com/Ascend910B4-memory
       memoryAllocatable: 32768
       memoryCapacity: 32768
+      memoryFactor: 1
       aiCore: 20
       aiCPU: 7
       templates:
@@ -382,6 +389,7 @@ data:
       resourceMemoryName: huawei.com/Ascend310P-memory
       memoryAllocatable: 21527
       memoryCapacity: 24576
+      memoryFactor: 1
       aiCore: 8
       aiCPU: 7
       templates:
 
@@ -14,3 +14,18 @@ subjects:
   - kind: ServiceAccount
     name: {{ include "hami-vgpu.scheduler" . }}
     namespace: {{ include "hami-vgpu.namespace" . }}
+---
+{{- if .Values.mockDevicePlugin.enabled }}
+{{- /* Binds the mock device plugin ClusterRole to the ServiceAccount of the same name in the chart namespace. Rendered only when mockDevicePlugin.enabled is true. */}}
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: {{ include "hami-vgpu.mock-device-plugin" . }}
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: {{ include "hami-vgpu.mock-device-plugin" . }}
+subjects:
+  - kind: ServiceAccount
+    name: {{ include "hami-vgpu.mock-device-plugin" . }}
+    namespace: {{ include "hami-vgpu.namespace" . }}
+{{- end -}}
@@ -6,3 +6,11 @@ metadata:
   labels:
     app.kubernetes.io/component: "hami-scheduler"
     {{- include "hami-vgpu.labels" . | nindent 4 }}
+---
+{{- if .Values.mockDevicePlugin.enabled }}
+{{- /* ServiceAccount for the mock device plugin, named by the "hami-vgpu.mock-device-plugin" helper. Rendered only when mockDevicePlugin.enabled is true. */}}
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: {{ include "hami-vgpu.mock-device-plugin" . }}
+  namespace: {{ include "hami-vgpu.namespace" . }}
+{{- end -}}
@@ -374,6 +374,25 @@ devicePlugin:
 #      cpu: 100m
 #      memory: 100Mi
 
+mockDevicePlugin:
+  enabled: false
+  image:
+    registry: "docker.io"
+    repository: "projecthami/mock-device-plugin"
+    tag: "0.1.0"
+    ## Specify an imagePullPolicy
+    ## Defaults to 'Always' if image tag is 'latest', else set to 'IfNotPresent'
+    ## ref: https://kubernetes.io/docs/user-guide/images/#pre-pulling-images
+    ##
+    pullPolicy: IfNotPresent
+    ## Optionally specify an array of imagePullSecrets.
+    ## Secrets must be manually created in the namespace.
+    ## Example:
+    ## pullSecrets:
+    ##   - myRegistryKeySecretName
+    ##
+    pullSecrets: []
+
 devices:
   amd:
     customresources:
 
@@ -31,6 +31,8 @@ You can update these configurations using one of the following methods:
   Note: When a container requests `nvidia.com/gpu` and its GPU memory reservation is exclusive (for example `nvidia.com/gpumem-percentage` is 100, or memory fields are omitted so `nvidia.defaultMem` remains 0 and defaults to 100%), and the pod spec does not set `nvidia.com/gpucores`, HAMi defaults `nvidia.com/gpucores` to 100 during admission. Non-exclusive memory requests or pods that already set `nvidia.com/gpucores` remain unchanged.
 * `nvidia.defaultGPUNum`: 
   Integer type, by default: equals 1, if configuration value is 0, then the configuration value will not take effect and will be filtered. when a user does not set nvidia.com/gpu this key in pod resource, webhook should check nvidia.com/gpumem、resource-mem-percentage、nvidia.com/gpucores this three key, anyone a key having value, webhook should add nvidia.com/gpu key and this default value to resources limits map.
+* `nvidia.memoryFactor`:
+  Integer type, by default: equals 1. When a container requests `nvidia.com/gpumem`, the requested value is multiplied by this factor to obtain the actual value. If `mock-device-plugin` is deployed, the value of `nvidia.com/gpumem` reported in `node.status.capacity` is also scaled up by the corresponding factor.
 * `nvidia.resourceCountName`: 
   String type, vgpu number resource name, default: "nvidia.com/gpu"
 * `nvidia.resourceMemoryName`: 
 
@@ -32,6 +32,8 @@
 * `nvidia.defaultGPUNum`：
   整数类型，默认为 1，如果配置为 0，则配置不会生效。当用户在 Pod 资源中没有设置 nvidia.com/gpu 这个 key 时，webhook 会检查 nvidia.com/gpumem、
   resource-mem-percentage、nvidia.com/gpucores 这三个 key 中的任何一个 key 有值，webhook 都会添加 nvidia.com/gpu 键和此默认值到 resources limit 中。
+* `nvidia.memoryFactor`：
+  整数类型，默认为 1。申请资源时，`nvidia.com/gpumem` 的实际值会放大相应的倍数。如果部署了 `mock-device-plugin`，`node.status.capacity` 中 `nvidia.com/gpumem` 的值也会放大对应的倍数。
 * `nvidia.resourceCountName`：
   字符串类型，申请 vgpu 个数的资源名，默认："nvidia.com/gpu"
 * `nvidia.resourceMemoryName`：
 
@@ -263,6 +263,11 @@ func (dev *Devices) GenerateResourceRequests(ctr *corev1.Container) device.Conta
 			if ok {
 				memnums, ok := mem.AsInt64()
 				if ok {
+					if dev.config.MemoryFactor > 1 {
+						rawMemnums := memnums
+						memnums = memnums * int64(dev.config.MemoryFactor)
+						klog.V(4).Infof("Update Ascend memory request. before %d, after %d, factor %d", rawMemnums, memnums, dev.config.MemoryFactor)
+					}
 					m, _ := dev.trimMemory(memnums)
 					memnum = int(m)
 				}