|
| 1 | +# Copyright 2025 Google LLC |
| 2 | +# |
| 3 | +# Licensed under the Apache License, Version 2.0 (the "License"); |
| 4 | +# you may not use this file except in compliance with the License. |
| 5 | +# You may obtain a copy of the License at |
| 6 | +# |
| 7 | +# http://www.apache.org/licenses/LICENSE-2.0 |
| 8 | +# |
| 9 | +# Unless required by applicable law or agreed to in writing, software |
| 10 | +# distributed under the License is distributed on an "AS IS" BASIS, |
| 11 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 12 | +# See the License for the specific language governing permissions and |
| 13 | +# limitations under the License. |
| 14 | +--- |
| 15 | +blueprint_name: slurm-h4d |
| 16 | +vars: |
| 17 | + project_id: ## Set GCP Project ID Here ## |
| 18 | + deployment_name: slurm-h4d |
| 19 | + region: us-central1 |
| 20 | + zone: us-central1-a |
| 21 | + rdma_net_range: 192.168.128.0/18 |
| 22 | + |
| 23 | +# Documentation for each of the modules used below can be found at |
| 24 | +# https://github.com/GoogleCloudPlatform/cluster-toolkit/blob/main/modules/README.md |
| 25 | +deployment_groups: |
| 26 | +- group: primary |
| 27 | + modules: |
| 28 | + |
| 29 | +# A source starting with "modules/" or "community/modules/" (no ./, ../ or / |
| 30 | +# prefix) is an embedded module. To refer to a local module, prefix the path with ./, ../ or / |
| 31 | + |
| 32 | + - id: h4d-slurm-net-0 |
| 33 | + source: modules/network/vpc |
| 34 | + |
| 35 | + - id: h4d-rdma-net |
| 36 | + source: modules/network/vpc |
| 37 | + settings: |
| 38 | + network_name: $(vars.deployment_name)-rdma-net-0 |
| 39 | + mtu: 8896 |
| 40 | + network_profile: https://www.googleapis.com/compute/beta/projects/$(vars.project_id)/global/networkProfiles/$(vars.zone)-vpc-falcon |
| 41 | + network_routing_mode: REGIONAL |
| 42 | + enable_cloud_router: false |
| 43 | + enable_cloud_nat: false |
| 44 | + subnetworks: |
| 45 | + - subnet_name: $(vars.deployment_name)-rdma-sub-0 |
| 46 | + subnet_region: $(vars.region) |
| 47 | + subnet_ip: $(vars.rdma_net_range) |
| 48 | + region: $(vars.region) |
| 49 | + firewall_rules: |
| 50 | + - name: $(vars.deployment_name)-rdma-0 |
| 51 | + ranges: [$(vars.rdma_net_range)] |
| 52 | + allow: |
| 53 | + - protocol: tcp |
| 54 | + - protocol: udp |
| 55 | + - protocol: icmp |
| 56 | + |
| 57 | + - id: homefs |
| 58 | + source: modules/file-system/filestore |
| 59 | + use: [h4d-slurm-net-0] |
| 60 | + settings: |
| 61 | + filestore_tier: BASIC_SSD |
| 62 | + size_gb: 2560 |
| 63 | + filestore_share_name: homeshare |
| 64 | + local_mount: /home |
| 65 | + |
| 66 | + - id: appsfs |
| 67 | + source: modules/file-system/filestore |
| 68 | + use: [h4d-slurm-net-0] |
| 69 | + settings: |
| 70 | + filestore_tier: BASIC_SSD |
| 71 | + size_gb: 2560 |
| 72 | + filestore_share_name: appsshare |
| 73 | + local_mount: /apps |
| 74 | + |
| 75 | + - id: h4d_startup |
| 76 | + source: modules/scripts/startup-script |
| 77 | + settings: |
| 78 | + install_cloud_rdma_drivers: true |
| 79 | + set_ofi_cloud_rdma_tunables: true |
| 80 | + local_ssd_filesystem: |
| 81 | + fs_type: ext4 |
| 82 | + mountpoint: /mnt/lssd |
| 83 | + permissions: "1777" |
| 84 | + |
| 85 | + - id: h4d_nodeset |
| 86 | + source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset |
| 87 | + use: [h4d_startup, h4d-slurm-net-0] |
| 88 | + settings: |
| 89 | + bandwidth_tier: gvnic_enabled |
| 90 | + machine_type: h4d-highmem-192-lssd |
| 91 | + node_count_static: 2 |
| 92 | + node_count_dynamic_max: 0 |
| 93 | + enable_placement: false |
| 94 | + disk_type: hyperdisk-balanced |
| 95 | + on_host_maintenance: TERMINATE |
| 96 | + additional_networks: |
| 97 | + $(concat( |
| 98 | + [{ |
| 99 | + network=null, |
| 100 | + subnetwork=h4d-rdma-net.subnetwork_self_link, |
| 101 | + subnetwork_project=vars.project_id, |
| 102 | + nic_type="IRDMA", |
| 103 | + queue_count=null, |
| 104 | + network_ip=null, |
| 105 | + stack_type=null, |
| 106 | + access_config=null, |
| 107 | + ipv6_access_config=[], |
| 108 | + alias_ip_range=[] |
| 109 | + }] |
| 110 | + )) |
| 111 | + |
| 112 | + - id: h4d_partition |
| 113 | + source: community/modules/compute/schedmd-slurm-gcp-v6-partition |
| 114 | + use: |
| 115 | + - h4d_nodeset |
| 116 | + settings: |
| 117 | + exclusive: false |
| 118 | + partition_name: h4d |
| 119 | + is_default: true |
| 120 | + partition_conf: |
| 121 | + ResumeTimeout: 900 |
| 122 | + SuspendTimeout: 600 |
| 123 | + |
| 124 | + - id: slurm_login |
| 125 | + source: community/modules/scheduler/schedmd-slurm-gcp-v6-login |
| 126 | + use: [h4d-slurm-net-0] |
| 127 | + settings: |
| 128 | + machine_type: n2-standard-4 |
| 129 | + enable_login_public_ips: true |
| 130 | + |
| 131 | + - id: slurm_controller_startup |
| 132 | + source: modules/scripts/startup-script |
| 133 | + settings: |
| 134 | + set_ofi_cloud_rdma_tunables: true |
| 135 | + |
| 136 | + - id: slurm_controller |
| 137 | + source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller |
| 138 | + use: [h4d-slurm-net-0, h4d_partition, slurm_login, homefs, appsfs] |
| 139 | + settings: |
| 140 | + enable_controller_public_ips: true |
| 141 | + controller_startup_script: $(slurm_controller_startup.startup_script) |
0 commit comments