|
---
# ClusterQueue "emergency-cluster-queue".
# Belongs to the "ai-for-humanity" cohort and provides cpu/memory quota
# through the "default-flavor" resource flavor.
apiVersion: kueue.x-k8s.io/v1beta1
kind: ClusterQueue
metadata:
  name: emergency-cluster-queue
spec:
  description: "Cluster queue for the emergency training jobs (Climate Change, Alzheimer, Cancer)."
  cohort: ai-for-humanity
  # Empty selector: match all namespaces.
  namespaceSelector: {}
  flavorFungibility:
    whenCanBorrow: Borrow
    whenCanPreempt: Preempt
  preemption:
    withinClusterQueue: LowerPriority
    reclaimWithinCohort: Any
    borrowWithinCohort:
      policy: LowerPriority
  resourceGroups:
    - coveredResources:
        - cpu
        - memory
      flavors:
        - name: default-flavor
          resources:
            - name: cpu
              nominalQuota: 1
            - name: memory
              nominalQuota: 2000Mi
              # Upper bound on memory this queue may borrow from its cohort.
              borrowingLimit: 500Mi
| 28 | + |
---
# ClusterQueue "llm-cluster-queue".
# Belongs to the "ai-for-humanity" cohort and provides cpu/memory quota
# through the "default-flavor" resource flavor.
apiVersion: kueue.x-k8s.io/v1beta1
kind: ClusterQueue
metadata:
  name: llm-cluster-queue
spec:
  description: "Cluster queue for LLM model workloads"
  cohort: ai-for-humanity
  # Declared exactly once (the original repeated this key further down, after
  # the preemption settings). An empty selector matches all namespaces.
  namespaceSelector: {}
  flavorFungibility:
    whenCanBorrow: Borrow
    whenCanPreempt: TryNextFlavor
  preemption:
    # Only preempt Workloads in the cohort that have lower priority than the
    # pending Workload.
    reclaimWithinCohort: LowerPriority
  resourceGroups:
  - coveredResources:
    - "cpu"
    - "memory"
    flavors:
    - name: "default-flavor"
      resources:
      - name: "cpu"
        nominalQuota: 500m
      - name: "memory"
        nominalQuota: 500Mi
        # Upper bound on memory this queue may borrow from its cohort.
        borrowingLimit: 500Mi
| 56 | + |
---
# ClusterQueue "gai-cluster-queue".
# Provides accelerator quota through the "gpu" resource flavor.
apiVersion: kueue.x-k8s.io/v1beta1
kind: ClusterQueue
metadata:
  name: gai-cluster-queue
spec:
  description: "Cluster queue for GAI model workloads"
  # NOTE(review): this cohort name differs from the "ai-for-humanity" cohort
  # used by the other queues in this file, so no quota is shared with them.
  # Confirm that this is intentional.
  cohort: ai-against-humanity
  namespaceSelector: {}
  preemption:
    reclaimWithinCohort: Never # do not preempt Workloads in the cohort.
  flavorFungibility:
    whenCanBorrow: Borrow # this is the default, made explicit here
    # Prefer preempting within the current flavor over trying the next flavor.
    # Only one flavor is listed below, so there is no "next flavor" to try.
    whenCanPreempt: Preempt
  resourceGroups:
  - coveredResources:
    # Extended resources must be fully qualified (vendor-domain/name); a bare
    # "gpu" cannot be requested by a pod, so the quota would never be used.
    # NOTE(review): assumes NVIDIA devices - adjust the vendor prefix if the
    # cluster exposes a different device plugin (e.g. amd.com/gpu).
    - "nvidia.com/gpu"
    flavors:
    - name: "gpu"
      resources:
      - name: "nvidia.com/gpu"
        # Whole-device count. The original value "48Gi" used a byte-size
        # suffix, which parses as roughly 5.15e10 devices.
        # NOTE(review): confirm that 48 GPUs is the intended quota.
        nominalQuota: 48
| 79 | + |
| 80 | + |
0 commit comments