Description
What happened:
The `hami.io/vgpu-devices-to-allocate` annotation on an already-allocated pod is never cleared.
In addition, the GPU UUID recorded in the annotations does not match the UUID reported by `nvidia-smi -L` inside the pod.
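A minimal way to observe the mismatch, assuming `kubectl` access to the cluster (pod name and namespace taken from the YAML below):

```bash
# UUID that HAMi recorded in the pod annotations:
kubectl -n 1 get pod a2e16143-9fe9-4504-9673-ec17c6cad005-654f4566f7-2lk5x \
  -o jsonpath='{.metadata.annotations.hami\.io/vgpu-devices-allocated}{"\n"}'
# -> GPU-718c4646-9de9-0850-a87d-9001a93bd69c,NVIDIA,2000,0:;

# UUID the container actually sees:
kubectl -n 1 exec a2e16143-9fe9-4504-9673-ec17c6cad005-654f4566f7-2lk5x -- nvidia-smi -L
# -> reports a different GPU UUID than the annotation above
```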
The pod's YAML:
```yaml
apiVersion: v1
kind: Pod
metadata:
  annotations:
    aip/deployment-history-id: 0998e190-1d93-40af-855c-8b4befd091fd
    hami.io/bind-phase: success
    hami.io/bind-time: "1744011458"
    hami.io/vgpu-devices-allocated: GPU-718c4646-9de9-0850-a87d-9001a93bd69c,NVIDIA,2000,0:;
    hami.io/vgpu-devices-to-allocate: GPU-718c4646-9de9-0850-a87d-9001a93bd69c,NVIDIA,2000,0:;
    hami.io/vgpu-node: dashuju-gpu-04
    hami.io/vgpu-time: "1744017459"
  creationTimestamp: "2025-04-07T07:37:38Z"
  generateName: a2e16143-9fe9-4504-9673-ec17c6cad005-654f4566f7-
  labels:
    aip/deployment-id: a2e16143-9fe9-4504-9673-ec17c6cad005
    hami.io/vgpu-node: dashuju-gpu-04
    juicefs-uniqueid: ""
    pod-template-hash: 654f4566f7
  managedFields:
  - apiVersion: v1
    fieldsType: FieldsV1
    fieldsV1:
      f:metadata:
        f:annotations:
          .: {}
          f:aip/deployment-history-id: {}
        f:generateName: {}
        f:labels:
          .: {}
          f:aip/deployment-id: {}
          f:pod-template-hash: {}
        f:ownerReferences:
          .: {}
          k:{"uid":"43393d13-53c8-4504-8852-74b4f44d6743"}:
            .: {}
            f:apiVersion: {}
            f:blockOwnerDeletion: {}
            f:controller: {}
            f:kind: {}
            f:name: {}
            f:uid: {}
      f:spec:
        f:affinity:
          .: {}
          f:podAntiAffinity:
            .: {}
            f:preferredDuringSchedulingIgnoredDuringExecution: {}
        f:containers:
          k:{"name":"aip-c8ddeda3528b461ea5cd715d9e424e9d"}:
            .: {}
            f:command: {}
            f:image: {}
            f:imagePullPolicy: {}
            f:name: {}
            f:ports:
              .: {}
              k:{"containerPort":29010,"protocol":"TCP"}:
                .: {}
                f:containerPort: {}
                f:protocol: {}
            f:resources:
              .: {}
              f:limits:
                .: {}
                f:cpu: {}
                f:memory: {}
                f:nvidia.com/gpu: {}
                f:nvidia.com/gpumem: {}
              f:requests:
                .: {}
                f:cpu: {}
                f:memory: {}
                f:nvidia.com/gpu: {}
                f:nvidia.com/gpumem: {}
            f:terminationMessagePath: {}
            f:terminationMessagePolicy: {}
            f:volumeMounts:
              .: {}
              k:{"mountPath":"/home"}:
                .: {}
                f:mountPath: {}
                f:name: {}
                f:subPath: {}
        f:dnsPolicy: {}
        f:enableServiceLinks: {}
        f:imagePullSecrets:
          .: {}
          k:{"name":"1-docker-registry"}:
            .: {}
            f:name: {}
        f:nodeSelector:
          .: {}
          f:bdps.group/8a748260909c8bb00190b0685483059d: {}
        f:restartPolicy: {}
        f:schedulerName: {}
        f:securityContext: {}
        f:terminationGracePeriodSeconds: {}
        f:tolerations: {}
        f:volumes:
          .: {}
          k:{"name":"aip-ability-pvc-2"}:
            .: {}
            f:name: {}
            f:persistentVolumeClaim:
              .: {}
              f:claimName: {}
          k:{"name":"aip-portal-sensitive"}:
            .: {}
            f:name: {}
            f:persistentVolumeClaim:
              .: {}
              f:claimName: {}
    manager: kube-controller-manager
    operation: Update
    time: "2025-04-07T07:37:38Z"
  - apiVersion: v1
    fieldsType: FieldsV1
    fieldsV1:
      f:metadata:
        f:annotations:
          f:hami.io/bind-phase: {}
    manager: nvidia-device-plugin
    operation: Update
    time: "2025-04-07T07:37:38Z"
  - apiVersion: v1
    fieldsType: FieldsV1
    fieldsV1:
      f:metadata:
        f:annotations:
          f:hami.io/bind-time: {}
          f:hami.io/vgpu-devices-allocated: {}
          f:hami.io/vgpu-devices-to-allocate: {}
          f:hami.io/vgpu-node: {}
          f:hami.io/vgpu-time: {}
        f:labels:
          f:hami.io/vgpu-node: {}
    manager: scheduler
    operation: Update
    time: "2025-04-07T07:37:40Z"
  - apiVersion: v1
    fieldsType: FieldsV1
    fieldsV1:
      f:metadata:
        f:labels:
          f:juicefs-uniqueid: {}
    manager: juicefs-csi-driver
    operation: Update
    time: "2025-04-07T07:37:49Z"
  - apiVersion: v1
    fieldsType: FieldsV1
    fieldsV1:
      f:status:
        f:conditions:
          k:{"type":"ContainersReady"}:
            .: {}
            f:lastProbeTime: {}
            f:lastTransitionTime: {}
            f:status: {}
            f:type: {}
          k:{"type":"Initialized"}:
            .: {}
            f:lastProbeTime: {}
            f:lastTransitionTime: {}
            f:status: {}
            f:type: {}
          k:{"type":"Ready"}:
            .: {}
            f:lastProbeTime: {}
            f:lastTransitionTime: {}
            f:status: {}
            f:type: {}
        f:containerStatuses: {}
        f:hostIP: {}
        f:phase: {}
        f:podIP: {}
        f:podIPs:
          .: {}
          k:{"ip":"10.244.7.67"}:
            .: {}
            f:ip: {}
        f:startTime: {}
    manager: kubelet
    operation: Update
    time: "2025-04-07T09:17:33Z"
  - apiVersion: v1
    fieldsType: FieldsV1
    fieldsV1:
      f:status:
        f:conditions:
          .: {}
          k:{"type":"PodScheduled"}:
            .: {}
            f:lastProbeTime: {}
            f:lastTransitionTime: {}
            f:message: {}
            f:reason: {}
            f:status: {}
            f:type: {}
    manager: kube-scheduler
    operation: Update
    time: "2025-04-07T09:17:39Z"
  name: a2e16143-9fe9-4504-9673-ec17c6cad005-654f4566f7-2lk5x
  namespace: "1"
  ownerReferences:
  - apiVersion: apps/v1
    blockOwnerDeletion: true
    controller: true
    kind: ReplicaSet
    name: a2e16143-9fe9-4504-9673-ec17c6cad005-654f4566f7
    uid: 43393d13-53c8-4504-8852-74b4f44d6743
  resourceVersion: "187365186"
  uid: e4e7a715-b15b-4fbe-a616-e6cb97f37c5c
spec:
  affinity:
    podAntiAffinity:
      preferredDuringSchedulingIgnoredDuringExecution:
      - podAffinityTerm:
          labelSelector:
            matchExpressions:
            - key: aip/deployment-id
              operator: In
              values:
              - a2e16143-9fe9-4504-9673-ec17c6cad005
          topologyKey: kubernetes.io/hostname
        weight: 100
  containers:
  - command:
    - bash
    - -c
    - cd /home/nlp_hw_business_hall_113M_service;python3 nlp_hw_business_hall_service.py
    image: harbor.harbor:30001/3d044511-45fd-439c-9e71-4482c1dde03c/hotword:v1
    imagePullPolicy: Always
    name: aip-c8ddeda3528b461ea5cd715d9e424e9d
    ports:
    - containerPort: 29010
      protocol: TCP
    resources:
      limits:
        cpu: "1"
        memory: 4000Mi
        nvidia.com/gpu: "1"
        nvidia.com/gpumem: 2k
      requests:
        cpu: "1"
        memory: 4000Mi
        nvidia.com/gpu: "1"
        nvidia.com/gpumem: 2k
    terminationMessagePath: /dev/termination-log
    terminationMessagePolicy: File
    volumeMounts:
    - mountPath: /home
      name: aip-ability-pvc-2
      subPath: bf4da92c-601f-4d6e-8829-561cbc12fa58/b2cbfa6d-5600-4159-b3c9-e9b592d80c73/a2e16143-9fe9-4504-9673-ec17c6cad005/faed84ad-7c7d-430d-8d17-f75cf9032313
    - mountPath: /var/run/secrets/kubernetes.io/serviceaccount
      name: default-token-qxmhp
      readOnly: true
  dnsPolicy: ClusterFirst
  enableServiceLinks: true
  imagePullSecrets:
  - name: 1-docker-registry
  nodeName: dashuju-gpu-04
  nodeSelector:
    bdps.group/8a748260909c8bb00190b0685483059d: 8a748260909c8bb00190b0685483059d
  preemptionPolicy: PreemptLowerPriority
  priority: 0
  restartPolicy: Always
  schedulerName: hami-scheduler
  securityContext: {}
  serviceAccount: default
  serviceAccountName: default
  terminationGracePeriodSeconds: 30
  tolerations:
  - effect: NoSchedule
    key: gpu
    operator: Exists
  - effect: NoExecute
    key: node.kubernetes.io/not-ready
    operator: Exists
    tolerationSeconds: 300
  - effect: NoExecute
    key: node.kubernetes.io/unreachable
    operator: Exists
    tolerationSeconds: 300
  volumes:
  - name: aip-ability-pvc-2
    persistentVolumeClaim:
      claimName: aip-ability-pvc-2
  - name: aip-portal-sensitive
    persistentVolumeClaim:
      claimName: aip-portal-sensitive
  - name: default-token-qxmhp
    secret:
      defaultMode: 420
      secretName: default-token-qxmhp
status:
  conditions:
  - lastProbeTime: null
    lastTransitionTime: "2025-04-07T07:37:38Z"
    status: "True"
    type: Initialized
  - lastProbeTime: null
    lastTransitionTime: "2025-04-07T07:37:54Z"
    status: "True"
    type: Ready
  - lastProbeTime: null
    lastTransitionTime: "2025-04-07T07:37:54Z"
    status: "True"
    type: ContainersReady
  - lastProbeTime: null
    lastTransitionTime: "2025-04-07T09:17:39Z"
    message: pod e4e7a715-b15b-4fbe-a616-e6cb97f37c5c is in the cache, so can't be assumed
    reason: SchedulerError
    status: "False"
    type: PodScheduled
  containerStatuses:
  - containerID: containerd://375bdf9a498ceb86cd237e6f9b3f4fc9e7aa131501eef1c64ae1bcafd72b552d
    image: harbor.harbor:30001/3d044511-45fd-439c-9e71-4482c1dde03c/hotword:v1
    imageID: harbor.harbor:30001/3d044511-45fd-439c-9e71-4482c1dde03c/hotword@sha256:6127dde70e22d37514738bc8991eaf594f3e220b873c1f5ad401fabe4495d0f8
    lastState: {}
    name: aip-c8ddeda3528b461ea5cd715d9e424e9d
    ready: true
    restartCount: 0
    started: true
    state:
      running:
        startedAt: "2025-04-07T07:37:54Z"
  hostIP: 192.168.129.15
  phase: Running
  podIP: 10.244.7.67
  podIPs:
  - ip: 10.244.7.67
  qosClass: Guaranteed
  startTime: "2025-04-07T07:37:38Z"
```
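For context, the annotation value appears to follow HAMi's per-container device encoding (GPU UUID, device type, memory in MiB, core percentage; `:` separates devices and `;` separates containers), so `GPU-718c4646…,NVIDIA,2000,0` is consistent with the `nvidia.com/gpumem: 2k` limit. Since the report is that `hami.io/vgpu-devices-to-allocate` is never cleared after binding, one hedged way to confirm is to watch the annotation over time:

```bash
# Watch hami.io/vgpu-devices-to-allocate; after a successful bind it is
# expected to be cleared, but here it keeps the allocated value.
kubectl -n 1 get pod a2e16143-9fe9-4504-9673-ec17c6cad005-654f4566f7-2lk5x -w \
  -o jsonpath='{.metadata.annotations.hami\.io/vgpu-devices-to-allocate}{"\n"}'
```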
What you expected to happen:
Normal scheduling: the `hami.io/vgpu-devices-to-allocate` annotation should be cleared after allocation, and the annotated UUID should match the GPU visible inside the pod.
How to reproduce it (as minimally and precisely as possible):
See the attached logs:
- device-plugin-hami-device-plugin-2tjvc.log
- kube-scheduler.log
- vgpu-scheduler-extender.log
Environment:
- HAMi version: 2.5.0
- Kubernetes version: 1.20.14