Commit 2e7e2f9

Merge branch 'GoogleCloudPlatform:main' into release-pr
2 parents 4cfa2bd + 285bbc5 commit 2e7e2f9

135 files changed: 2,064 additions, 55,592 deletions

cluster-toolkit-writers.json

Lines changed: 6 additions & 0 deletions
@@ -388,5 +388,11 @@
   },
   {
     "login": "saara-tyagi27"
+  },
+  {
+    "login": "mufaqam-gcl"
+  },
+  {
+    "login": "akiki-liang0"
   }
 ]

cmd/root.go

Lines changed: 1 addition & 1 deletion
@@ -53,7 +53,7 @@ HPC deployments on the Google Cloud Platform.`,
         logging.Fatal("cmd.Help function failed: %s", err)
       }
     },
-    Version:     "v1.67.0",
+    Version:     "v1.68.0",
     Annotations: annotation,
   }
 )

community/examples/gke-tpu-v6/gke-tpu-v6-deployment.yaml

Lines changed: 6 additions & 0 deletions
@@ -55,3 +55,9 @@ vars:
 
   # The name of the compute engine reservation of TPU v6 nodes
   reservation:
+
+  # The disk size of system node pool for this deployment.
+  system_node_pool_disk_size_gb:
+
+  # The disk size of v6e node pool for this deployment.
+  v6e_node_pool_disk_size_gb:
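
These new deployment variables are left blank in the example file so they can be set per deployment. A minimal filled-in sketch follows; the reservation name is a placeholder, while the disk sizes mirror the defaults the blueprint itself sets in gke-tpu-v6.yaml:

vars:
  reservation: my-v6e-reservation       # placeholder; use your own reservation name
  system_node_pool_disk_size_gb: 200    # matches the blueprint default
  v6e_node_pool_disk_size_gb: 100       # matches the blueprint default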

community/examples/gke-tpu-v6/gke-tpu-v6.yaml

Lines changed: 84 additions & 4 deletions
@@ -49,13 +49,17 @@ vars:
   # The name of the compute engine reservation of TPU v6 nodes
   reservation:
 
+  system_node_pool_disk_size_gb: 200
+  v6e_node_pool_disk_size_gb: 100
+
 
 deployment_groups:
 - group: primary
   modules:
-  - id: gke-tpu-v6-net
+  - id: gke-tpu-v6-net-0
     source: modules/network/vpc
     settings:
+      network_name: $(vars.deployment_name)-net-0
       subnetworks:
       - subnet_name: $(vars.deployment_name)-sub-0
         subnet_region: $(vars.region)
@@ -69,24 +73,84 @@ deployment_groups:
           ip_cidr_range: 10.0.32.0/20
       firewall_rules:
       - name: $(vars.deployment_name)-internal-0
-        ranges: [192.168.0.0/18]
+        ranges: [192.168.0.0/16]
         allow:
         - protocol: tcp
           ports: ["0-65535"]
         - protocol: udp
           ports: ["0-65535"]
         - protocol: icmp
 
+  - id: gke-tpu-v6-net-1
+    source: modules/network/vpc
+    settings:
+      network_name: $(vars.deployment_name)-net-1
+      subnetworks:
+      - subnet_name: $(vars.deployment_name)-sub-1
+        subnet_region: $(vars.region)
+        subnet_ip: 192.168.64.0/18
+      firewall_rules:
+      - name: $(vars.deployment_name)-internal-1
+        ranges: [192.168.0.0/16]
+        allow:
+        - protocol: tcp
+          ports: ["0-65535"]
+        - protocol: udp
+          ports: ["0-65535"]
+        - protocol: icmp
+
+  - id: node_pool_service_account
+    source: community/modules/project/service-account
+    settings:
+      name: gke-np-sa
+      project_roles:
+      - logging.logWriter
+      - monitoring.metricWriter
+      - monitoring.viewer
+      - stackdriver.resourceMetadata.writer
+      - storage.objectViewer
+      - artifactregistry.reader
+
+  - id: workload_service_account
+    source: community/modules/project/service-account
+    settings:
+      name: gke-wl-sa
+      project_roles:
+      - logging.logWriter
+      - monitoring.metricWriter
+      - monitoring.viewer
+      - stackdriver.resourceMetadata.writer
+      - storage.objectAdmin
+      - artifactregistry.reader
+      - container.admin
+
   - id: gke-tpu-v6-cluster
     source: modules/scheduler/gke-cluster
-    use: [gke-tpu-v6-net]
+    use: [gke-tpu-v6-net-0, workload_service_account]
     settings:
       system_node_pool_machine_type: "n2-standard-8"
+      system_node_pool_disk_size_gb: $(vars.system_node_pool_disk_size_gb)
       system_node_pool_taints: []
       enable_private_endpoint: false  # Allows access from authorized public IPs
+      configure_workload_identity_sa: true
       master_authorized_networks:
       - cidr_block: $(vars.authorized_cidr)  # Allows your machine to run the kubectl command. Required for multi network setup.
         display_name: "kubectl-access-network"
+      additional_networks:
+        $(concat(
+          [{
+            network=gke-tpu-v6-net-1.network_name,
+            subnetwork=gke-tpu-v6-net-1.subnetwork_name,
+            subnetwork_project=vars.project_id,
+            nic_type="GVNIC",
+            queue_count=null,
+            network_ip=null,
+            stack_type=null,
+            access_config=[{nat_ip=null, public_ptr_domain_name=null, network_tier=null}],
+            ipv6_access_config=[],
+            alias_ip_range=[]
+          }]
+        ))
       # Cluster versions cannot be updated through the toolkit after creation
       # Please manage cluster version from the Google Cloud Console directly
       version_prefix: "1.32."
@@ -100,15 +164,31 @@
 
   - id: gke-tpu-v6-pool
     source: modules/compute/gke-node-pool
-    use: [gke-tpu-v6-cluster]
+    use: [gke-tpu-v6-cluster, node_pool_service_account]
     settings:
       num_slices: $(vars.num_slices)
       name: gke-tpu-v6-pool
       disk_type: hyperdisk-balanced
       machine_type: $(vars.machine_type)
       auto_upgrade: true
       zones: [$(vars.zone)]
+      disk_size_gb: $(vars.v6e_node_pool_disk_size_gb)
       static_node_count: $(vars.static_node_count)
+      additional_networks:
+        $(concat(
+          [{
+            network=gke-tpu-v6-net-1.network_name,
+            subnetwork=gke-tpu-v6-net-1.subnetwork_name,
+            subnetwork_project=vars.project_id,
+            nic_type="GVNIC",
+            queue_count=null,
+            network_ip=null,
+            stack_type=null,
+            access_config=[{nat_ip=null, public_ptr_domain_name=null, network_tier=null}],
+            ipv6_access_config=[],
+            alias_ip_range=[]
+          }]
+        ))
       reservation_affinity:
         consume_reservation_type: SPECIFIC_RESERVATION
         specific_reservations:
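
Two details in this file are easy to miss. The internal firewall range widens from 192.168.0.0/18 to 192.168.0.0/16, presumably so it also covers the new 192.168.64.0/18 subnet on the second VPC. And the $(concat([...])) expressions build the list of extra network attachments for the cluster and the node pool; written out as plain YAML, the single attachment they produce corresponds roughly to the sketch below (field names are taken from the expression above; the blueprint uses the expression form, so treat this only as a reading aid):

additional_networks:
- network: $(gke-tpu-v6-net-1.network_name)
  subnetwork: $(gke-tpu-v6-net-1.subnetwork_name)
  subnetwork_project: $(vars.project_id)
  nic_type: GVNIC
  queue_count: null
  network_ip: null
  stack_type: null
  access_config:
  - nat_ip: null
    public_ptr_domain_name: null
    network_tier: null
  ipv6_access_config: []
  alias_ip_range: []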
Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
+# cgroup.conf
+# https://slurm.schedmd.com/cgroup.conf.html
+
+CgroupPlugin=autodetect
+IgnoreSystemd=yes
+# EnableControllers=yes
+ConstrainCores=yes
+ConstrainRamSpace=yes
+ConstrainSwapSpace=no
+ConstrainDevices=yes
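
For context, this cgroup template is consumed by the Slurm controller module in the new slurm-gke blueprint added later in this commit; condensed from that blueprint (use: list and other settings omitted), the wiring is:

  - id: slurm_controller
    source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller
    settings:
      cgroup_conf_tpl: $(ghpc_stage("./files/cgroup.conf.tpl"))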
Lines changed: 18 additions & 0 deletions
@@ -0,0 +1,18 @@
+# Copyright 2025 "Google LLC"
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: ${namespace}
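
The ${namespace} placeholder in this manifest template is filled in by the kubectl-apply module in the slurm-gke blueprint below; the relevant module, reproduced from that blueprint, is:

  - id: gke_ns_manifest
    source: modules/management/kubectl-apply
    use: [gke_cluster]
    settings:
      apply_manifests:
      - source: $(ghpc_stage("./files/slurm-namespace.yaml.tftpl"))
        template_vars:
          namespace: $(vars.slurm_namespace)  # defaults to "slurm"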
Lines changed: 192 additions & 0 deletions
@@ -0,0 +1,192 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+blueprint_name: slurm-gke
+
+vars:
+  # The following variables should be over-written in the deployment.yaml file.
+  # Your GCP Project ID
+  project_id: ## Set GCP Project ID Here ##
+
+  # This should be unique across all of your Cluster
+  # Toolkit Deployments.
+  deployment_name: slurmgke
+  # The GCP Region used for this deployment.
+  region:
+
+  # The GCP Zone used for this deployment.
+  zone:
+
+  # Cidr block containing the IP of the machine calling terraform.
+  # To allow all (IAM restrictions still enforced), use 0.0.0.0/0
+  # To allow only your IP address, use <YOUR-IP-ADDRESS>/32
+  authorized_cidr:
+
+  # The number of nodes to be created for the Slurm GKE NodeSet.
+  gke_nodeset_replicas: 2
+
+
+  # The pre-built Slinky Image for GKE Nodeset.
+  # Follow instruction in ./images/containers to build this image.
+  slinky_image: ghcr.io/slinkyproject/slurmd-pyxis:24.11-ubuntu24.04
+
+  # Namespace where Slurm GKE NodeSet will be created
+  slurm_namespace: slurm
+
+deployment_groups:
+- group: primary
+  modules:
+
+  ###### Common resources ######
+
+  - id: network
+    source: modules/network/vpc
+    settings:
+      subnetwork_name: $(vars.deployment_name)-subnet
+      secondary_ranges_list:
+      - subnetwork_name: $(vars.deployment_name)-subnet
+        ranges:
+        - range_name: pods
+          ip_cidr_range: 10.4.0.0/14
+        - range_name: services
+          ip_cidr_range: 10.0.32.0/20
+
+  - id: private_service_access
+    source: community/modules/network/private-service-access
+    use: [network]
+
+  - id: homefs
+    source: modules/file-system/filestore
+    use: [network, private_service_access]
+    settings:
+      local_mount: /home
+
+  ###### GKE Setup ######
+
+  - id: gke_service_account
+    source: community/modules/project/service-account
+    settings:
+      name: slinky-gke-sa
+      project_roles:
+      - logging.logWriter
+      - monitoring.metricWriter
+      - monitoring.viewer
+      - stackdriver.resourceMetadata.writer
+      - storage.objectAdmin
+      - artifactregistry.reader
+
+  - id: gke_cluster
+    source: modules/scheduler/gke-cluster
+    use: [network, gke_service_account]
+    settings:
+      enable_private_endpoint: false
+      enable_gcsfuse_csi: true
+      enable_filestore_csi: true
+      master_authorized_networks:
+      - cidr_block: $(vars.authorized_cidr)  # Allows your machine to run the kubectl command. Required for multi network setup.
+        display_name: "kubectl-access-network"
+      system_node_pool_enabled: false
+      configure_workload_identity_sa: true
+      enable_dcgm_monitoring: true
+    outputs: [instructions]
+
+  - id: gke_base_pool
+    source: modules/compute/gke-node-pool
+    use: [gke_cluster, gke_service_account]
+    settings:
+      initial_node_count: 1
+      disk_type: pd-balanced
+      machine_type: e2-standard-4
+      zones: [$(vars.zone)]
+
+  - id: gke_compute_pool
+    source: modules/compute/gke-node-pool
+    use: [gke_cluster, gke_service_account]
+    settings:
+      name: gke-compute-pool
+      initial_node_count: $(vars.gke_nodeset_replicas)
+      disk_type: pd-balanced
+      machine_type: c2-standard-16
+      zones: [$(vars.zone)]
+
+  - id: gke_ns_manifest
+    source: modules/management/kubectl-apply
+    use: [gke_cluster]
+    settings:
+      apply_manifests:
+      - source: $(ghpc_stage("./files/slurm-namespace.yaml.tftpl"))
+        template_vars:
+          namespace: $(vars.slurm_namespace)
+
+  - id: slinky
+    source: community/modules/scheduler/slinky
+    use:
+    - gke_cluster
+    - gke_base_pool  # Optionally specify nodepool(s) to avoid operator components running on HPC hardware
+    settings:
+      slurm_operator_namespace: $(vars.slurm_namespace)
+      install_slurm_operator_chart: true
+      install_slurm_chart: false
+
+  - id: gke_compute_nodeset
+    source: community/modules/compute/gke-nodeset
+    use: [gke_compute_pool, slinky, homefs, slurm_controller, network]
+    settings:
+      slurm_cluster_name: $(vars.deployment_name)
+      image: $(vars.slinky_image)
+
+  - id: gke_compute_partition
+    source: community/modules/compute/gke-partition
+    use: [slurm_controller, gke_compute_nodeset]
+
+  ###### GCE Setup ######
+
+  - id: debug_nodeset
+    source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset
+    use: [network]
+    settings:
+      node_count_dynamic_max: 4
+      machine_type: n2-standard-2
+      allow_automatic_updates: false
+
+  - id: debug_partition
+    source: community/modules/compute/schedmd-slurm-gcp-v6-partition
+    use:
+    - debug_nodeset
+    settings:
+      partition_name: debug
+      exclusive: false  # allows nodes to stay up after jobs are done
+      is_default: true
+      suspend_time: -1  # prevents nodes from suspending while it's idle
+
+  - id: slurm_login
+    source: community/modules/scheduler/schedmd-slurm-gcp-v6-login
+    use: [network]
+    settings:
+      machine_type: n2-standard-4
+      enable_login_public_ips: true
+
+  - id: slurm_controller
+    source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller
+    use:
+    - network
+    - slurm_login
+    - debug_partition
+    - homefs
+    settings:
+      slurm_cluster_name: $(vars.deployment_name)
+      enable_slurm_auth: true
+      cgroup_conf_tpl: $(ghpc_stage("./files/cgroup.conf.tpl"))
+      enable_controller_public_ips: true
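
As the blueprint's header comment notes, these vars are meant to be overridden in a deployment.yaml file. A minimal sketch of such a file (variable names come from the blueprint; every value below is a placeholder):

vars:
  project_id: my-gcp-project        # placeholder
  deployment_name: slurmgke
  region: us-central1               # placeholder
  zone: us-central1-a               # placeholder
  authorized_cidr: 203.0.113.7/32   # placeholder; your workstation IP in /32 form
  gke_nodeset_replicas: 2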
