Commit 3c69786

Merge pull request #3862 from ndebuhr/feat/slinky-scheduler

Create a new community scheduler module for Slinky (Slurm on Kubernetes)

2 parents db5d854 + b728e07

File tree: 14 files changed, +858 −1 lines changed
Lines changed: 27 additions & 0 deletions
@@ -0,0 +1,27 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

apiVersion: monitoring.googleapis.com/v1
kind: PodMonitoring
metadata:
  name: slurm-exporter
  namespace: slurm
spec:
  selector:
    matchLabels:
      app.kubernetes.io/instance: slurm-exporter
      app.kubernetes.io/name: slurm-exporter
  endpoints:
  - port: metrics
    interval: 30s
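As a quick check that this manifest is applied and being scraped, you can inspect the resource with `kubectl` — a sketch, assuming Managed Service for Prometheus managed collection is enabled on the cluster so the `PodMonitoring` CRD exists:

```bash
# List PodMonitoring resources in the slurm namespace
kubectl get podmonitoring -n slurm

# Show status conditions for the slurm-exporter scrape target
kubectl describe podmonitoring slurm-exporter -n slurm
```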
Lines changed: 166 additions & 0 deletions
@@ -0,0 +1,166 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
---

blueprint_name: hpc-slinky

vars:
  project_id: ## Set GCP Project ID Here ##
  deployment_name: slinky-01
  region: us-central1
  zones:
  - us-central1-a
  authorized_cidr: # <your-ip-address>/32
  gcp_public_cidrs_access_enabled: false
  exporter_pod_monitoring_path: $(ghpc_stage("./exporter-pod-monitoring.yaml"))
  debug_nodeset_replicas: 2
  h3_nodeset_replicas: 2

deployment_groups:
- group: primary
  modules:
  - id: network
    source: modules/network/vpc
    settings:
      subnetwork_name: $(vars.deployment_name)-subnet
      secondary_ranges_list:
      - subnetwork_name: $(vars.deployment_name)-subnet
        ranges:
        - range_name: pods
          ip_cidr_range: 10.4.0.0/14
        - range_name: services
          ip_cidr_range: 10.0.32.0/20

  - id: node_pool_service_account
    source: community/modules/project/service-account
    settings:
      name: gke-np-sa
      project_roles:
      - logging.logWriter
      - monitoring.metricWriter
      - monitoring.viewer
      - stackdriver.resourceMetadata.writer
      - storage.objectAdmin
      - artifactregistry.reader

  - id: workload_service_account
    source: community/modules/project/service-account
    settings:
      name: gke-wl-sa
      project_roles:
      - logging.logWriter
      - monitoring.metricWriter
      - monitoring.viewer
      - stackdriver.resourceMetadata.writer
      - storage.objectAdmin
      - artifactregistry.reader

  - id: gke_cluster
    source: modules/scheduler/gke-cluster
    use: [network, workload_service_account]
    settings:
      enable_private_endpoint: false
      gcp_public_cidrs_access_enabled: $(vars.gcp_public_cidrs_access_enabled)
      master_authorized_networks:
      - display_name: deployment-machine
        cidr_block: $(vars.authorized_cidr)
      system_node_pool_enabled: false
      configure_workload_identity_sa: true
    outputs: [instructions]

  - id: base_pool
    source: modules/compute/gke-node-pool
    use: [gke_cluster, node_pool_service_account]
    settings:
      initial_node_count: 1
      disk_type: pd-balanced
      machine_type: e2-standard-4
      zones: $(vars.zones)

  - id: h3_pool
    source: modules/compute/gke-node-pool
    use: [gke_cluster, node_pool_service_account]
    settings:
      initial_node_count: 2
      disk_type: pd-balanced
      machine_type: h3-standard-88
      zones: $(vars.zones)

  - id: slinky
    source: community/modules/scheduler/slinky
    use:
    - gke_cluster
    - base_pool  # Optionally specify nodepool(s) to avoid operator components running on HPC hardware
    settings:
      slurm_values:
        compute:
          nodesets:
          - name: debug
            enabled: true
            replicas: $(vars.debug_nodeset_replicas)
            image:
              # Use the default nodeset image
              repository: ""
              tag: ""
            resources:
              requests:
                cpu: 500m
                memory: 4Gi
              limits:
                cpu: 500m
                memory: 4Gi
            affinity:
              nodeAffinity:
                requiredDuringSchedulingIgnoredDuringExecution:
                  nodeSelectorTerms:
                  - matchExpressions:
                    - key: "node.kubernetes.io/instance-type"
                      operator: In
                      values:
                      - e2-standard-4
            partition:
              enabled: true
          - name: h3
            enabled: true
            replicas: $(vars.h3_nodeset_replicas)
            image:
              # Use the default nodeset image
              repository: ""
              tag: ""
            resources:
              requests:
                cpu: 86
                memory: 324Gi
              limits:
                cpu: 86
                memory: 324Gi
            affinity:
              nodeAffinity:
                requiredDuringSchedulingIgnoredDuringExecution:
                  nodeSelectorTerms:
                  - matchExpressions:
                    - key: "node.kubernetes.io/instance-type"
                      operator: In
                      values:
                      - h3-standard-88
            partition:
              enabled: true
    outputs: [instructions]

  - id: slurm_exporter_monitoring
    source: modules/management/kubectl-apply
    use: [gke_cluster]
    settings:
      apply_manifests:
      - source: $(vars.exporter_pod_monitoring_path)
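For context, blueprints like this one are deployed with the Cluster Toolkit `gcluster` CLI. A minimal sketch, assuming the blueprint file is saved as `hpc-slinky.yaml` (the file name and project ID are placeholders):

```bash
# Generate the deployment folder from the blueprint, supplying the required variable
./gcluster create hpc-slinky.yaml --vars project_id=<your-project-id>

# Provision the resources (the folder name comes from deployment_name: slinky-01)
./gcluster deploy slinky-01
```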
Lines changed: 129 additions & 0 deletions
@@ -0,0 +1,129 @@
## Description

This module creates a [Slinky](https://slinky.ai) cluster and nodesets for a [Slurm](https://slurm.schedmd.com/documentation.html)-on-Kubernetes HPC setup.

The setup closely follows the [documented quickstart installation](https://github.com/SlinkyProject/slurm-operator/blob/main/docs/quickstart.md), except for a more lightweight monitoring/metrics setup. Consider scraping the Slurm Exporter with [Google Managed Prometheus](https://cloud.google.com/stackdriver/docs/managed-prometheus) and a [PodMonitoring resource](https://cloud.google.com/stackdriver/docs/managed-prometheus/setup-managed#gmp-pod-monitoring) rather than a cluster-local Kube Prometheus Stack (although both are possible through module parameterization).

Through `cert_manager_values`, `prometheus_values`, `slurm_operator_values`, and `slurm_values`, you can customize the Helm releases that constitute Slinky. The Cert Manager, Slurm Operator, and Slurm Helm installations are required, whereas the Kube Prometheus Stack chart is optional and not installed by default. Set `install_kube_prometheus_stack=true` to install Prometheus.

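As an illustration of those knobs, a sketch that enables the optional Kube Prometheus Stack and overrides its values through the module settings (the values shown simply repeat the documented defaults from the Inputs table below):

```yaml
  - id: slinky
    source: community/modules/scheduler/slinky
    use: [gke_cluster]
    settings:
      install_kube_prometheus_stack: true
      prometheus_values:
        installCRDs: true
```
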
### Example

```yaml
  - id: slinky
    source: community/modules/scheduler/slinky
    use: [gke_cluster, base_pool]
    settings:
      slurm_values:
        compute:
          nodesets:
          - name: h3
            enabled: true
            replicas: 2
            image:
              # Use the default nodeset image
              repository: ""
              tag: ""
            resources:
              requests:
                cpu: 86
                memory: 324Gi
              limits:
                cpu: 86
                memory: 324Gi
            affinity:
              nodeAffinity:
                requiredDuringSchedulingIgnoredDuringExecution:
                  nodeSelectorTerms:
                  - matchExpressions:
                    - key: "node.kubernetes.io/instance-type"
                      operator: In
                      values:
                      - h3-standard-88
            partition:
              enabled: true
```

This creates a Slinky cluster with the following attributes:

* Slinky Helm releases are installed atop the `gke_cluster` (from the `gke-cluster` module).
* Slinky system components are scheduled on the `base_pool` (from the `gke-node-pool` module).
  * This node affinity specification is recommended: it reserves HPC hardware for the HPC nodesets, and it ensures the Helm releases are fully uninstalled before all node pools are deleted during a `gcluster destroy` (see the sketch after this list).
* One Slurm nodeset is provisioned, with resource requests/limits and node affinities aligned to h3-standard-88 VMs.

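The node-pool pinning for system components can also be set explicitly through the module's `node_pool_names` input (see the Inputs table below). A minimal sketch, with a hypothetical pool name:

```yaml
  - id: slinky
    source: community/modules/scheduler/slinky
    use: [gke_cluster]
    settings:
      node_pool_names: [base-pool]  # hypothetical node pool name
```
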
### Usage

To test Slurm functionality, connect to the controller and use Slurm client commands:

```bash
gcloud container clusters get-credentials YOUR_CLUSTER --region YOUR_REGION
```

```bash
kubectl exec -it statefulsets/slurm-controller \
  --namespace=slurm \
  -- bash --login
```

On the controller pod (e.g., as slurm@slurm-controller-0), run the following commands to quickly verify that Slurm is functioning:

```bash
sinfo
srun hostname
sbatch --wrap="sleep 60"
squeue
```

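Assuming the `sbatch` test job is queued or running, a few more standard Slurm client commands help with inspection and cleanup (`<jobid>` is a placeholder taken from the `squeue` output):

```bash
scontrol ping          # confirm the slurmctld controller responds
scontrol show nodes    # inspect the state of the nodeset nodes
scancel <jobid>        # cancel the test job
```
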
<!-- BEGINNING OF PRE-COMMIT-TERRAFORM DOCS HOOK -->
## Requirements

| Name | Version |
|------|---------|
| <a name="requirement_terraform"></a> [terraform](#requirement\_terraform) | >= 1.3 |
| <a name="requirement_google"></a> [google](#requirement\_google) | >= 6.16 |
| <a name="requirement_helm"></a> [helm](#requirement\_helm) | ~> 2.17 |

## Providers

| Name | Version |
|------|---------|
| <a name="provider_google"></a> [google](#provider\_google) | >= 6.16 |
| <a name="provider_helm"></a> [helm](#provider\_helm) | ~> 2.17 |

## Modules

No modules.

## Resources

| Name | Type |
|------|------|
| [helm_release.cert_manager](https://registry.terraform.io/providers/hashicorp/helm/latest/docs/resources/release) | resource |
| [helm_release.prometheus](https://registry.terraform.io/providers/hashicorp/helm/latest/docs/resources/release) | resource |
| [helm_release.slurm](https://registry.terraform.io/providers/hashicorp/helm/latest/docs/resources/release) | resource |
| [helm_release.slurm_operator](https://registry.terraform.io/providers/hashicorp/helm/latest/docs/resources/release) | resource |
| [google_client_config.default](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/client_config) | data source |
| [google_container_cluster.gke_cluster](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/container_cluster) | data source |

## Inputs

| Name | Description | Type | Default | Required |
|------|-------------|------|---------|:--------:|
| <a name="input_cert_manager_chart_version"></a> [cert\_manager\_chart\_version](#input\_cert\_manager\_chart\_version) | Version of the Cert Manager chart to install. | `string` | `"v1.17.1"` | no |
| <a name="input_cert_manager_values"></a> [cert\_manager\_values](#input\_cert\_manager\_values) | Value overrides for the Cert Manager release | `any` | <pre>{<br/> "crds": {<br/> "enabled": true<br/> }<br/>}</pre> | no |
| <a name="input_cluster_id"></a> [cluster\_id](#input\_cluster\_id) | An identifier for the GKE cluster resource with format projects/<project\_id>/locations/<region>/clusters/<name>. | `string` | n/a | yes |
| <a name="input_install_kube_prometheus_stack"></a> [install\_kube\_prometheus\_stack](#input\_install\_kube\_prometheus\_stack) | Install the Kube Prometheus Stack. | `bool` | `false` | no |
| <a name="input_node_pool_names"></a> [node\_pool\_names](#input\_node\_pool\_names) | Names of node pools, for use in node affinities (Slinky system components). | `list(string)` | `null` | no |
| <a name="input_project_id"></a> [project\_id](#input\_project\_id) | The project ID that hosts the GKE cluster. | `string` | n/a | yes |
| <a name="input_prometheus_chart_version"></a> [prometheus\_chart\_version](#input\_prometheus\_chart\_version) | Version of the Kube Prometheus Stack chart to install. | `string` | `"70.4.1"` | no |
| <a name="input_prometheus_values"></a> [prometheus\_values](#input\_prometheus\_values) | Value overrides for the Prometheus release | `any` | <pre>{<br/> "installCRDs": true<br/>}</pre> | no |
| <a name="input_slurm_chart_version"></a> [slurm\_chart\_version](#input\_slurm\_chart\_version) | Version of the Slurm chart to install. | `string` | `"0.2.1"` | no |
| <a name="input_slurm_operator_chart_version"></a> [slurm\_operator\_chart\_version](#input\_slurm\_operator\_chart\_version) | Version of the Slurm Operator chart to install. | `string` | `"0.2.1"` | no |
| <a name="input_slurm_operator_values"></a> [slurm\_operator\_values](#input\_slurm\_operator\_values) | Value overrides for the Slurm Operator release | `any` | `{}` | no |
| <a name="input_slurm_values"></a> [slurm\_values](#input\_slurm\_values) | Value overrides for the Slurm release | `any` | `{}` | no |

## Outputs

| Name | Description |
|------|-------------|
| <a name="output_instructions"></a> [instructions](#output\_instructions) | Post deployment instructions. |
<!-- END OF PRE-COMMIT-TERRAFORM DOCS HOOK -->
