From 90ee83e3aba5080a9a44a57445391f6abd4ab8a6 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 11 Nov 2025 09:45:39 +0000 Subject: [PATCH 1/4] Initial plan From a7ef7af47bb6a52c8dff75806a256cfca864a417 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 11 Nov 2025 12:40:12 +0000 Subject: [PATCH 2/4] Refactor Plugins documentation into separate pages under Scheduler menu Co-authored-by: JesseStutler <38534065+JesseStutler@users.noreply.github.com> --- config/_default/menus.toml | 2 +- content/en/docs/binpack.md | 26 ++++ content/en/docs/drf.md | 26 ++++ content/en/docs/gang.md | 28 +++++ content/en/docs/nodeorder.md | 24 ++++ content/en/docs/numa-aware.md | 34 ++++++ content/en/docs/plugins-overview.md | 34 ++++++ content/en/docs/plugins.md | 176 ---------------------------- content/en/docs/predicates.md | 24 ++++ content/en/docs/priority.md | 28 +++++ content/en/docs/proportion.md | 24 ++++ content/en/docs/sla.md | 24 ++++ content/en/docs/task-topology.md | 32 +++++ content/en/docs/tdm.md | 24 ++++ 14 files changed, 329 insertions(+), 177 deletions(-) create mode 100644 content/en/docs/binpack.md create mode 100644 content/en/docs/drf.md create mode 100644 content/en/docs/gang.md create mode 100644 content/en/docs/nodeorder.md create mode 100644 content/en/docs/numa-aware.md create mode 100644 content/en/docs/plugins-overview.md delete mode 100644 content/en/docs/plugins.md create mode 100644 content/en/docs/predicates.md create mode 100644 content/en/docs/priority.md create mode 100644 content/en/docs/proportion.md create mode 100644 content/en/docs/sla.md create mode 100644 content/en/docs/task-topology.md create mode 100644 content/en/docs/tdm.md diff --git a/config/_default/menus.toml b/config/_default/menus.toml index 5604ce52..dee8a21a 100644 --- a/config/_default/menus.toml +++ b/config/_default/menus.toml @@ -90,7 +90,7 @@ [[docs]] 
name = "Contribution" - weight = 8 + weight = 9 identifier = "contribution" # Documentation version v1.12.0 diff --git a/content/en/docs/binpack.md b/content/en/docs/binpack.md new file mode 100644 index 00000000..8db659d6 --- /dev/null +++ b/content/en/docs/binpack.md @@ -0,0 +1,26 @@ ++++ +title = "Binpack Plugin" + +date = 2021-05-13 +lastmod = 2021-05-13 + +draft = false # Is this a draft? true/false +toc = true # Show table of contents? true/false +type = "docs" # Do not modify. + +# Add menu entry to sidebar. +linktitle = "Binpack" +[menu.docs] + parent = "plugins" + weight = 2 ++++ + +## Overview + +The goal of the BinPack scheduling algorithm is to fill as many existing nodes as possible (try not to allocate blank nodes). In the concrete implementation, BinPack scheduling algorithm scores the nodes that can be delivered, and the higher the score, the higher the resource utilization rate of nodes. Binpack algorithm can fill up the nodes as much as possible to close the application load to some nodes, which is very conducive to the automatic expansion capacity function of K8s cluster nodes. + +The BinPack algorithm is injected into the Volcano-Scheduler process as a plug-in and will be applied during the Pod stage of node selection. When calculating the Binpack algorithm, the Volcano-Scheduler considers the various resources requested by Pod and averages them according to the weights configured for each resource. The weight of each resource in the node score calculation is different, depending on the weight value configured by the administrator for each resource. Different plug-ins also need to assign different weights when calculating node scores, and the Scheduler also sets the score weights for BinPack plugins. + +## Scenario + +The BinPack algorithm is good for small jobs that can fill as many nodes as possible. 
For example, the single query job in the big data scene, the order generation in the e-commerce seckill scene, the single identification job in the AI scene, and the high concurrency service scene on the Internet, etc. This scheduling algorithm can reduce the fragmentation in the node as much as possible, and reserve enough resource space on the idle machine for Pod which has applied for more resource requests, so as to maximize the utilization of idle resources under the cluster. diff --git a/content/en/docs/drf.md b/content/en/docs/drf.md new file mode 100644 index 00000000..974adbae --- /dev/null +++ b/content/en/docs/drf.md @@ -0,0 +1,26 @@ ++++ +title = "DRF Plugin" + +date = 2021-05-13 +lastmod = 2021-05-13 + +draft = false # Is this a draft? true/false +toc = true # Show table of contents? true/false +type = "docs" # Do not modify. + +# Add menu entry to sidebar. +linktitle = "DRF" +[menu.docs] + parent = "plugins" + weight = 4 ++++ + +{{
}} + +## Overview + +The full name of DRF scheduling algorithm is Dominant Resource Fairness, which is a scheduling algorithm based on the container group Dominant Resource. Dominant Resource is the largest percentage of all required resources for a container group. The DRF algorithm selects the Dominant Resource that is the smallest in a series of container groups for priority scheduling. This allows more jobs to be satisfied, so that a single resource-heavy job does not starve a large number of small jobs. DRF scheduling algorithm can ensure that many types of resources coexist in the environment, as far as possible to meet the fair principle of allocation. + +## Scenario + +The DRF scheduling algorithm gives priority to the throughput of the business in the cluster and is suitable for batch small business scenarios such as a single AI training, a single big data calculation and a query. diff --git a/content/en/docs/gang.md b/content/en/docs/gang.md new file mode 100644 index 00000000..04734a0f --- /dev/null +++ b/content/en/docs/gang.md @@ -0,0 +1,28 @@ ++++ +title = "Gang Plugin" + +date = 2021-05-13 +lastmod = 2021-05-13 + +draft = false # Is this a draft? true/false +toc = true # Show table of contents? true/false +type = "docs" # Do not modify. + +# Add menu entry to sidebar. +linktitle = "Gang" +[menu.docs] + parent = "plugins" + weight = 1 ++++ + +{{
}} + +## Overview + +The Gang scheduling strategy is one of the core scheduling algorithms of the Volcano-Scheduler. It meets the scheduling requirements of "All or nothing" in the scheduling process and avoids the waste of cluster resources caused by arbitrary scheduling of Pod. The Gang scheduler algorithm is to observe whether the scheduled number of Pods under Job meets the minimum number of runs. When the minimum number of runs of Job is satisfied, the scheduling action is executed for all Pods under Job; otherwise, it is not executed. + +## Scenario + +The Gang scheduling algorithm based on the container group concept is well suited for scenarios that require multi-process collaboration. AI scenes often contain complex processes. Data Ingestion, Data Analysts, Data Splitting, trainers, Serving, Logging, etc., which require a group of containers to work together, are suitable for container-based Gang scheduling strategies. Multi-thread parallel computing communication scenarios under MPI computing framework are also suitable for Gang scheduling because master and slave processes need to work together. Containers under the container group are highly correlated, and there may be resource contention. The overall scheduling allocation can effectively solve the deadlock. + +In the case of insufficient cluster resources, the scheduling strategy of Gang can significantly improve the utilization of cluster resources. diff --git a/content/en/docs/nodeorder.md b/content/en/docs/nodeorder.md new file mode 100644 index 00000000..7d182099 --- /dev/null +++ b/content/en/docs/nodeorder.md @@ -0,0 +1,24 @@ ++++ +title = "Nodeorder Plugin" + +date = 2021-05-13 +lastmod = 2021-05-13 + +draft = false # Is this a draft? true/false +toc = true # Show table of contents? true/false +type = "docs" # Do not modify. + +# Add menu entry to sidebar. 
+linktitle = "Nodeorder" +[menu.docs] + parent = "plugins" + weight = 8 ++++ + +## Overview + +The NodeOrder Plugin is a scheduling optimization strategy that scores nodes from various dimensions through simulated assignments to find the node that is best suited for the current job. The scoring parameters are configured by the user. The parameter contains the Affinity, reqResource, LeastReqResource, MostResource, balanceReqResouce. + +## Scenario + +NodeOrder Plugin provides scoring criteria of multiple dimensions for scheduling, and the combination of different dimensions enables users to flexibly configure appropriate scheduling policies according to their own needs. diff --git a/content/en/docs/numa-aware.md b/content/en/docs/numa-aware.md new file mode 100644 index 00000000..d4725a9e --- /dev/null +++ b/content/en/docs/numa-aware.md @@ -0,0 +1,34 @@ ++++ +title = "Numa-aware Plugin" + +date = 2021-05-13 +lastmod = 2021-05-13 + +draft = false # Is this a draft? true/false +toc = true # Show table of contents? true/false +type = "docs" # Do not modify. + +# Add menu entry to sidebar. +linktitle = "Numa-aware" +[menu.docs] + parent = "plugins" + weight = 11 ++++ + +## Overview + +When the node runs many CPU-bound pods, the workload can move to different CPU cores depending on whether the pod is throttled and which CPU cores are available at scheduling time. Many workloads are not sensitive to this migration and thus work fine without any intervention. However, in workloads where CPU cache affinity and scheduling latency significantly affect workload performance, the kubelet allows alternative CPU management policies to determine some placement preferences on the node. + +The CPU Manager and the Topology Manager are both Kubelet components. However, there are the following limitations: + +- The scheduler is not topology-aware, so it is possible to be scheduled on a node and then fail on the node due to the Topology Manager. This is unacceptable for a TensorFlow job. 
If any worker or ps failed on node, the job will fail. +- The managers are node-level that results in an inability to match the best node for NUMA topology in the whole cluster. + +The Numa-Aware Plugin aims to address these limitations. + +- Support cpu resource topology scheduling. +- Support pod-level topology policies. + +## Scenario + +Common scenarios for NUMA-Aware are computation-intensive jobs that are sensitive to CPU parameters, scheduling delays. Such as scientific calculation, video decoding, animation rendering, big data offline processing and other specific scenes. diff --git a/content/en/docs/plugins-overview.md b/content/en/docs/plugins-overview.md new file mode 100644 index 00000000..a2a5d3f9 --- /dev/null +++ b/content/en/docs/plugins-overview.md @@ -0,0 +1,34 @@ ++++ +title = "Plugins Overview" + +date = 2021-05-13 +lastmod = 2021-05-13 + +draft = false # Is this a draft? true/false +toc = false # Show table of contents? true/false +type = "docs" # Do not modify. + +linktitle = "Plugins" +[menu.docs] + parent = "scheduler" + weight = 3 + identifier = "plugins" ++++ + +Volcano scheduler provides a rich set of plugins to support various scheduling scenarios. Each plugin implements specific scheduling algorithms and policies. + +## Available Plugins + +- **Gang**: All-or-nothing scheduling for batch jobs +- **Binpack**: Maximize node resource utilization +- **Priority**: Job and task priority-based scheduling +- **DRF**: Dominant Resource Fairness scheduling +- **Proportion**: Queue-based resource allocation +- **Task-topology**: Affinity and anti-affinity based scheduling +- **Predicates**: Job pre-selection and filtering +- **Nodeorder**: Multi-dimensional node scoring +- **SLA**: Service Level Agreement enforcement +- **TDM**: Time Division Multiplexing for shared nodes +- **Numa-aware**: NUMA topology-aware scheduling + +Please select a specific plugin from the submenu to learn more about its functionality and use cases. 
diff --git a/content/en/docs/plugins.md b/content/en/docs/plugins.md deleted file mode 100644 index cc8020ba..00000000 --- a/content/en/docs/plugins.md +++ /dev/null @@ -1,176 +0,0 @@ -+++ -title = "Plugins" - -date = 2021-05-13 -lastmod = 2021-05-13 - -draft = false # Is this a draft? true/false -toc = true # Show table of contents? true/false -type = "docs" # Do not modify. - -# Add menu entry to sidebar. -linktitle = "Plugins" -[menu.docs] - parent = "scheduler" - weight = 3 -+++ - - - -### Gang - -{{
}} - -#### Overview - -The Gang scheduling strategy is one of the core scheduling algorithms of the Volcano-Scheduler. It meets the scheduling requirements of "All or nothing" in the scheduling process and avoids the waste of cluster resources caused by arbitrary scheduling of Pod. The Gang scheduler algorithm is to observe whether the scheduled number of Pods under Job meets the minimum number of runs. When the minimum number of runs of Job is satisfied, the scheduling action is executed for all Pods under Job; otherwise, it is not executed. - -#### Scenario - -The Gang scheduling algorithm based on the container group concept is well suited for scenarios that require multi-process collaboration. AI scenes often contain complex processes. Data Ingestion, Data Analysts, Data Splitting, trainers, Serving, Logging, etc., which require a group of containers to work together, are suitable for container-based Gang scheduling strategies. Multi-thread parallel computing communication scenarios under MPI computing framework are also suitable for Gang scheduling because master and slave processes need to work together. Containers under the container group are highly correlated, and there may be resource contention. The overall scheduling allocation can effectively solve the deadlock. - -In the case of insufficient cluster resources, the scheduling strategy of Gang can significantly improve the utilization of cluster resources. - -### Binpack - -#### Overview - -The goal of the BinPack scheduling algorithm is to fill as many existing nodes as possible (try not to allocate blank nodes). In the concrete implementation, BinPack scheduling algorithm scores the nodes that can be delivered, and the higher the score, the higher the resource utilization rate of nodes. Binpack algorithm can fill up the nodes as much as possible to close the application load to some nodes, which is very conducive to the automatic expansion capacity function of K8s cluster nodes. 
- -The BinPack algorithm is injected into the Volcano-Scheduler process as a plug-in and will be applied during the Pod stage of node selection. When calculating the Binpack algorithm, the Volcano-Scheduler considers the various resources requested by Pod and averages them according to the weights configured for each resource. The weight of each resource in the node score calculation is different, depending on the weight value configured by the administrator for each resource. Different plug-ins also need to assign different weights when calculating node scores, and the Scheduler also sets the score weights for BinPack plugins. - -#### Scenario - -The BinPack algorithm is good for small jobs that can fill as many nodes as possible. For example, the single query job in the big data scene, the order generation in the e-commerce seckill scene, the single identification job in the AI scene, and the high concurrency service scene on the Internet, etc. This scheduling algorithm can reduce the fragmentation in the node as much as possible, and reserve enough resource space on the idle machine for Pod which has applied for more resource requests, so as to maximize the utilization of idle resources under the cluster. - - - -### Priority - -{{
}} - -#### Overview - -The Priority Plugin provides the implementation of job, Task sorting, and PreempTablefn, a function that calculates sacrifice jobs. Job sorting according to priorityClassName, the task of sorting by priorityClassName, createTime, id in turn. - -#### Scenario - -When the cluster runs multiple jobs but is low on resources, and each Job has a different number of Pods waiting to be scheduled, if you use the Kubernetes default scheduler, the Job with more Pods will eventually get more of the cluster's resources. In this case, the Volcano-Scheduler provides algorithms that enable different jobs to share cluster resources in a fair-share. - -The Priority Plugin enables users to customize their job and task priorities, and to customize scheduling policies at different levels according to their own needs. Priority is arranged according to Job's PriorityClassName at the application level. For example, there are financial scenarios, Internet of Things monitoring scenarios and other applications requiring high real-time performance in the cluster, and the Priority Plugin can ensure that they are scheduled in Priority. - - - -### DRF - -{{
}} - -#### Overview - -The full name of DRF scheduling algorithm is Dominant Resource Fairness, which is a scheduling algorithm based on the container group Dominant Resource. Dominant Resource is the largest percentage of all required resources for a container group. The DRF algorithm selects the Dominant Resource that is the smallest in a series of container groups for priority scheduling. This can meet more job, not because a fat business, starve a large number of small business. DRF scheduling algorithm can ensure that many types of resources coexist in the environment, as far as possible to meet the fair principle of allocation. - -#### Scenario - -The DRF scheduling algorithm gives priority to the throughput of the business in the cluster and is suitable for batch small business scenarios such as a single AI training, a single big data calculation and a query. - - - -### Proportion - -#### Overview - -Proportion scheduling algorithm uses the concept of queue to control the Proportion of total resources allocated in the cluster. Each queue allocates a certain proportion of cluster resources. For example, there are three teams that share A pool of resources on A cluster: Team A uses up to 40% of the total cluster, Team B uses up to 30%, and Team C uses up to 30%. If the amount of work delivered exceeds the team's maximum available resources, there is a queue. - -#### Scenario - -Proportion scheduling algorithm improves the flexibility and elasticity of cluster scheduling. The most typical scenario is that when multiple development teams in a company share a cluster, this scheduling algorithm can handle the requirements of shared resource matching and isolation between different departments very well. In multi-service mixed scenarios, such as computation-intensive AI business, network IO-intensive MPI and HPC business, and storage-intensive big data business, Proportion scheduling algorithm can allocate shared resources according to demand through matching. 
- - - -### Task-topology - -#### Overview - -The task-topology algorithm is an algorithm that computes the priority of tasks and nodes based on the affinity and anti-affinity configuration between tasks within a Job. By configuring the affinity and anti-affinity policies between tasks within the Job and using the Task-Topology algorithm, tasks with affinity configurations can be scheduled to the same node first, and PODs with anti-affinity configurations to different nodes. - -#### Scenario - -node affinity: - -- Task-topology is important for improving computational efficiency in deep learning computing scenarios. Using the TensorFlow calculation as an example, configure the affinity between "ps" and "worker". Task-topology algorithm enables "ps" and "worker" to be scheduled to the same node as far as possible, so as to improve the efficiency of network and data interaction between "ps" and "worker", thus improving the computing efficiency. -- Tasks in HPC and MPI scenarios are highly synchronized and need high-speed network IO. - -Anti-affinity: - -- Take the TensorFlow calculation as an example, the anti-affinity between "ps" and "ps" - -- Master and slave backup of e-commerce service scene, data disaster tolerant, to ensure that there are spare jobs to continue to provide service after a job fails. - - - -### Predicates - -#### Overview - -The Predicate Plugin calls the PredicateGPU with pod and nodeInfo as parameters to evaluate and pre-select jobs based on the results. - -#### Scenario - -In AI scenarios where GPU resources are required, the Predicate Plugin can quickly filter out those that require the GPU for centralized scheduling. - - -### Nodeorder - -#### Overview - -The NodeOrder Plugin is a scheduling optimization strategy that scores nodes from various dimensions through simulated assignments to find the node that is best suited for the current job. The scoring parameters are configured by the user. 
The parameter contains the Affinity、reqResource、LeastReqResource、MostResource、balanceReqResouce. - -#### Scenario - -NodeOrder Plugin provides scoring criteria of multiple dimensions for scheduling, and the combination of different dimensions enables users to flexibly configure appropriate scheduling policies according to their own needs. - - - -### SLA - -#### Overview - -When users apply jobs to Volcano, they may need adding some particular constraints to job, for example, longest Pending time aiming to prevent job from starving. And these constraints can be regarded as Service Level Agreement (SLA) which are agreed between volcano and user. So sla plugin is provided to receive and realize SLA settings for both individual job and whole cluster. - -#### Scenario - -Users can customize SLA related parameters in their own cluster according to business needs. For example, for clusters with high real-time service requirements, JobWaitingTime can be set as small as possible. For clusters with bulk computing jobs, JobWaitingTime can be set to larger. The parameters of a specific SLA and the optimization of the parameters need to be combined with the specific business and related performance measurement results. - -### TDM - -#### Overview - -The full name of TDM is Time Division Multiplexing. In a co-located environment, some nodes are in both Kubernetes cluster and Yarn cluster. For these nodes, Kubernetes and Yarn cluster can use these resource by time-sharing multiplexing.The TDM Plugin marks these nodes as `revocable nodes`. TDM plugin will try to dispatch `preemptable task` to `revocable node` in node revocable time and evict the `preemptable task` from `revocable node` out of revocable time.. TDM Plugin improves the time-division multiplexing ability of node resources in the scheduling process of Volcano. 
- -#### Scenario - -In ToB business, cloud vendors provide cloud-based resources for merchants, and different merchants adopt different container arrangement frameworks (Kubernetes/YARN, etc.). TDM Plugin improves the time-sharing efficiency of common node resources and further improves the utilization rate of resources. - - - -### Numa-aware - -#### Overview - -When the node runs many CPU-bound pods, the workload can move to different CPU cores depending on whether the pod is throttled and which CPU cores are available at scheduling time. Many workloads are not sensitive to this migration and thus work fine without any intervention. However, in workloads where CPU cache affinity and scheduling latency significantly affect workload performance, the kubelet allows alternative CPU management policies to determine some placement preferences on the node. - -The CPU Manager and the Topology Manager are all Kubelet components, However There is the following limitation: - -- The scheduler is not topology-aware. so it is possible to be scheduled on a node and then fail on the node due to the Topology Manager. this is unacceptable for TensorFlow job. If any worker or ps failed on node, the job will fail. -- The managers are node-level that results in an inability to match the best node for NUMA topology in the whole cluster. - -The Numa-Aware Plugin aims to address these limitations. - -- Support cpu resource topology scheduling. -- Support pod-level topology policies. - -#### Scenario - -Common scenarios for NUMA-Aware are computation-intensive jobs that are sensitive to CPU parameters, scheduling delays. Such as scientific calculation, video decoding, animation rendering, big data offline processing and other specific scenes. 
- - diff --git a/content/en/docs/predicates.md b/content/en/docs/predicates.md new file mode 100644 index 00000000..2c375aa8 --- /dev/null +++ b/content/en/docs/predicates.md @@ -0,0 +1,24 @@ ++++ +title = "Predicates Plugin" + +date = 2021-05-13 +lastmod = 2021-05-13 + +draft = false # Is this a draft? true/false +toc = true # Show table of contents? true/false +type = "docs" # Do not modify. + +# Add menu entry to sidebar. +linktitle = "Predicates" +[menu.docs] + parent = "plugins" + weight = 7 ++++ + +## Overview + +The Predicate Plugin calls the PredicateGPU with pod and nodeInfo as parameters to evaluate and pre-select jobs based on the results. + +## Scenario + +In AI scenarios where GPU resources are required, the Predicate Plugin can quickly filter out those that require the GPU for centralized scheduling. diff --git a/content/en/docs/priority.md b/content/en/docs/priority.md new file mode 100644 index 00000000..27c86ae4 --- /dev/null +++ b/content/en/docs/priority.md @@ -0,0 +1,28 @@ ++++ +title = "Priority Plugin" + +date = 2021-05-13 +lastmod = 2021-05-13 + +draft = false # Is this a draft? true/false +toc = true # Show table of contents? true/false +type = "docs" # Do not modify. + +# Add menu entry to sidebar. +linktitle = "Priority" +[menu.docs] + parent = "plugins" + weight = 3 ++++ + +{{
}} + +## Overview + +The Priority Plugin provides the implementation of job, Task sorting, and PreempTablefn, a function that calculates sacrifice jobs. Job sorting according to priorityClassName, the task of sorting by priorityClassName, createTime, id in turn. + +## Scenario + +When the cluster runs multiple jobs but is low on resources, and each Job has a different number of Pods waiting to be scheduled, if you use the Kubernetes default scheduler, the Job with more Pods will eventually get more of the cluster's resources. In this case, the Volcano-Scheduler provides algorithms that enable different jobs to share cluster resources in a fair-share. + +The Priority Plugin enables users to customize their job and task priorities, and to customize scheduling policies at different levels according to their own needs. Priority is arranged according to Job's PriorityClassName at the application level. For example, there are financial scenarios, Internet of Things monitoring scenarios and other applications requiring high real-time performance in the cluster, and the Priority Plugin can ensure that they are scheduled in Priority. diff --git a/content/en/docs/proportion.md b/content/en/docs/proportion.md new file mode 100644 index 00000000..973e2e52 --- /dev/null +++ b/content/en/docs/proportion.md @@ -0,0 +1,24 @@ ++++ +title = "Proportion Plugin" + +date = 2021-05-13 +lastmod = 2021-05-13 + +draft = false # Is this a draft? true/false +toc = true # Show table of contents? true/false +type = "docs" # Do not modify. + +# Add menu entry to sidebar. +linktitle = "Proportion" +[menu.docs] + parent = "plugins" + weight = 5 ++++ + +## Overview + +Proportion scheduling algorithm uses the concept of queue to control the Proportion of total resources allocated in the cluster. Each queue allocates a certain proportion of cluster resources. 
For example, there are three teams that share A pool of resources on A cluster: Team A uses up to 40% of the total cluster, Team B uses up to 30%, and Team C uses up to 30%. If the amount of work delivered exceeds the team's maximum available resources, there is a queue. + +## Scenario + +Proportion scheduling algorithm improves the flexibility and elasticity of cluster scheduling. The most typical scenario is that when multiple development teams in a company share a cluster, this scheduling algorithm can handle the requirements of shared resource matching and isolation between different departments very well. In multi-service mixed scenarios, such as computation-intensive AI business, network IO-intensive MPI and HPC business, and storage-intensive big data business, Proportion scheduling algorithm can allocate shared resources according to demand through matching. diff --git a/content/en/docs/sla.md b/content/en/docs/sla.md new file mode 100644 index 00000000..1103958a --- /dev/null +++ b/content/en/docs/sla.md @@ -0,0 +1,24 @@ ++++ +title = "SLA Plugin" + +date = 2021-05-13 +lastmod = 2021-05-13 + +draft = false # Is this a draft? true/false +toc = true # Show table of contents? true/false +type = "docs" # Do not modify. + +# Add menu entry to sidebar. +linktitle = "SLA" +[menu.docs] + parent = "plugins" + weight = 9 ++++ + +## Overview + +When users apply jobs to Volcano, they may need adding some particular constraints to job, for example, longest Pending time aiming to prevent job from starving. And these constraints can be regarded as Service Level Agreement (SLA) which are agreed between volcano and user. So sla plugin is provided to receive and realize SLA settings for both individual job and whole cluster. + +## Scenario + +Users can customize SLA related parameters in their own cluster according to business needs. For example, for clusters with high real-time service requirements, JobWaitingTime can be set as small as possible. 
For clusters with bulk computing jobs, JobWaitingTime can be set to larger. The parameters of a specific SLA and the optimization of the parameters need to be combined with the specific business and related performance measurement results. diff --git a/content/en/docs/task-topology.md b/content/en/docs/task-topology.md new file mode 100644 index 00000000..0c6867bd --- /dev/null +++ b/content/en/docs/task-topology.md @@ -0,0 +1,32 @@ ++++ +title = "Task-topology Plugin" + +date = 2021-05-13 +lastmod = 2021-05-13 + +draft = false # Is this a draft? true/false +toc = true # Show table of contents? true/false +type = "docs" # Do not modify. + +# Add menu entry to sidebar. +linktitle = "Task-topology" +[menu.docs] + parent = "plugins" + weight = 6 ++++ + +## Overview + +The task-topology algorithm is an algorithm that computes the priority of tasks and nodes based on the affinity and anti-affinity configuration between tasks within a Job. By configuring the affinity and anti-affinity policies between tasks within the Job and using the Task-Topology algorithm, tasks with affinity configurations can be scheduled to the same node first, and PODs with anti-affinity configurations to different nodes. + +## Scenario + +**Node affinity:** + +- Task-topology is important for improving computational efficiency in deep learning computing scenarios. Using the TensorFlow calculation as an example, configure the affinity between "ps" and "worker". Task-topology algorithm enables "ps" and "worker" to be scheduled to the same node as far as possible, so as to improve the efficiency of network and data interaction between "ps" and "worker", thus improving the computing efficiency. +- Tasks in HPC and MPI scenarios are highly synchronized and need high-speed network IO. 
+ +**Anti-affinity:** + +- Take the TensorFlow calculation as an example, the anti-affinity between "ps" and "ps" +- Master and slave backup of e-commerce service scene, data disaster tolerant, to ensure that there are spare jobs to continue to provide service after a job fails. diff --git a/content/en/docs/tdm.md b/content/en/docs/tdm.md new file mode 100644 index 00000000..5d91dfd5 --- /dev/null +++ b/content/en/docs/tdm.md @@ -0,0 +1,24 @@ ++++ +title = "TDM Plugin" + +date = 2021-05-13 +lastmod = 2021-05-13 + +draft = false # Is this a draft? true/false +toc = true # Show table of contents? true/false +type = "docs" # Do not modify. + +# Add menu entry to sidebar. +linktitle = "TDM" +[menu.docs] + parent = "plugins" + weight = 10 ++++ + +## Overview + +The full name of TDM is Time Division Multiplexing. In a co-located environment, some nodes are in both Kubernetes cluster and Yarn cluster. For these nodes, Kubernetes and Yarn cluster can use these resources by time-sharing multiplexing. The TDM Plugin marks these nodes as `revocable nodes`. TDM plugin will try to dispatch `preemptable task` to `revocable node` in node revocable time and evict the `preemptable task` from `revocable node` out of revocable time. TDM Plugin improves the time-division multiplexing ability of node resources in the scheduling process of Volcano. + +## Scenario + +In ToB business, cloud vendors provide cloud-based resources for merchants, and different merchants adopt different container orchestration frameworks (Kubernetes/YARN, etc.). TDM Plugin improves the time-sharing efficiency of common node resources and further improves the utilization rate of resources. 
From 94a18bf53467f3550af34c7797e510146eddad16 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 11 Nov 2025 13:00:57 +0000 Subject: [PATCH 3/4] Implement Plugins dropdown submenu with template modifications Co-authored-by: JesseStutler <38534065+JesseStutler@users.noreply.github.com> --- config/_default/menus.toml | 6 ++++ content/en/docs/plugins-overview.md | 7 ++--- .../layouts/partials/docs_sidebar.html | 30 +++++++++++++++++++ 3 files changed, 39 insertions(+), 4 deletions(-) diff --git a/config/_default/menus.toml b/config/_default/menus.toml index dee8a21a..32f65802 100644 --- a/config/_default/menus.toml +++ b/config/_default/menus.toml @@ -83,6 +83,12 @@ weight = 6 identifier = "scheduler" +[[docs]] + name = "Plugins" + parent = "scheduler" + weight = 3 + identifier = "plugins" + [[docs]] name = "CLI" weight = 7 diff --git a/content/en/docs/plugins-overview.md b/content/en/docs/plugins-overview.md index a2a5d3f9..70a1162f 100644 --- a/content/en/docs/plugins-overview.md +++ b/content/en/docs/plugins-overview.md @@ -8,11 +8,10 @@ draft = false # Is this a draft? true/false toc = false # Show table of contents? true/false type = "docs" # Do not modify. -linktitle = "Plugins" +linktitle = "Plugins Overview" [menu.docs] - parent = "scheduler" - weight = 3 - identifier = "plugins" + parent = "plugins" + weight = 0 +++ Volcano scheduler provides a rich set of plugins to support various scheduling scenarios. Each plugin implements specific scheduling algorithms and policies. 
diff --git a/themes/academic/layouts/partials/docs_sidebar.html b/themes/academic/layouts/partials/docs_sidebar.html index b756daea..3f1e7aa9 100644 --- a/themes/academic/layouts/partials/docs_sidebar.html +++ b/themes/academic/layouts/partials/docs_sidebar.html @@ -27,7 +27,21 @@ @@ -59,4 +73,20 @@ } } } + + function toggleSubmenu(event, identifier) { + event.preventDefault(); + event.stopPropagation(); + var submenu = document.getElementById('submenu-' + identifier); + var dot = document.getElementById('dot-' + identifier); + if (submenu.style.display === 'none' || submenu.style.display === '') { + submenu.style.display = 'block'; + dot.style.borderTopColor = 'rgba(0, 0, 0, .65)'; + dot.style.borderRightColor = '#fff'; + } else { + submenu.style.display = 'none'; + dot.style.borderRightColor = 'rgba(0, 0, 0, .65)'; + dot.style.borderTopColor = '#fff'; + } + } From 1d90ffe406fc7b4d4dbd27efff51fed49c758a20 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 11 Nov 2025 14:01:04 +0000 Subject: [PATCH 4/4] Add visual indentation to Scheduler submenu items Co-authored-by: JesseStutler <38534065+JesseStutler@users.noreply.github.com> --- config/_default/menus.toml | 6 ------ content/en/docs/plugins-overview.md | 7 ++++--- themes/academic/layouts/partials/docs_sidebar.html | 2 +- 3 files changed, 5 insertions(+), 10 deletions(-) diff --git a/config/_default/menus.toml b/config/_default/menus.toml index 32f65802..dee8a21a 100644 --- a/config/_default/menus.toml +++ b/config/_default/menus.toml @@ -83,12 +83,6 @@ weight = 6 identifier = "scheduler" -[[docs]] - name = "Plugins" - parent = "scheduler" - weight = 3 - identifier = "plugins" - [[docs]] name = "CLI" weight = 7 diff --git a/content/en/docs/plugins-overview.md b/content/en/docs/plugins-overview.md index 70a1162f..a2a5d3f9 100644 --- a/content/en/docs/plugins-overview.md +++ b/content/en/docs/plugins-overview.md @@ -8,10 +8,11 @@ draft = false # Is this a 
draft? true/false toc = false # Show table of contents? true/false type = "docs" # Do not modify. -linktitle = "Plugins Overview" +linktitle = "Plugins" [menu.docs] - parent = "plugins" - weight = 0 + parent = "scheduler" + weight = 3 + identifier = "plugins" +++ Volcano scheduler provides a rich set of plugins to support various scheduling scenarios. Each plugin implements specific scheduling algorithms and policies. diff --git a/themes/academic/layouts/partials/docs_sidebar.html b/themes/academic/layouts/partials/docs_sidebar.html index 3f1e7aa9..d2889d94 100644 --- a/themes/academic/layouts/partials/docs_sidebar.html +++ b/themes/academic/layouts/partials/docs_sidebar.html @@ -26,7 +26,7 @@ {{- if .HasChildren }}