Skip to content

Commit e28dd7f

Browse files
committed
Revert per_zone var changes to provider defaults and add validations
Change-Id: I46912057ac6fb59dc1f106eaf01cc73db7a97432
1 parent fac6ef3 commit e28dd7f

File tree

9 files changed

+38
-36
lines changed

9 files changed

+38
-36
lines changed

examples/gke-consumption-options/dws-flex-start/gke-tpu-7x/README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,8 @@ These steps guide you through the cluster creation process for TPUs using DWS Fl
1111
`region`: the compute region for the cluster.
1212
`zone`: the compute zone for the node pool of TPU 7x machines.
1313
**`enable_flex_start`**: set to `true` to enable DWS Flex Start.
14-
**`autoscaling_min_node_count_per_zone`**: set to `0` (required for Flex Start).
15-
**`autoscaling_max_node_count_per_zone`**: set to the required node count for your topology (e.g., `2` for a `2x2x2` topology).
14+
**`autoscaling_min_node_count`**: set to `0` (required for Flex Start).
15+
**`autoscaling_max_node_count`**: set to the required node count for your topology (e.g., `2` for a `2x2x2` topology).
1616
`authorized_cidr`: The IP address range that you want to allow to connect with the cluster.
1717
To modify advanced settings, edit `examples/gke-consumption-options/dws-flex-start/gke-tpu-7x/gke-tpu-7x.yaml`.
1818

examples/gke-consumption-options/dws-flex-start/gke-tpu-7x/gke-tpu-7x-deployment.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,8 +51,8 @@ vars:
5151
# num chips for the machine type (e.g. 4 for tpu7x-standard-4t).
5252
# Reference: https://cloud.google.com/tpu/docs/v6e
5353
enable_flex_start: true
54-
autoscaling_min_node_count_per_zone: 0
55-
autoscaling_max_node_count_per_zone: 2
54+
autoscaling_min_node_count: 0
55+
autoscaling_max_node_count: 2
5656

5757
# CIDR block containing the IP of the machine calling Terraform.
5858
# To allow all (IAM restrictions still enforced), use 0.0.0.0/0

examples/gke-consumption-options/dws-flex-start/gke-tpu-7x/gke-tpu-7x.yaml

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -45,8 +45,8 @@ vars:
4545

4646
# --- FLEX START SETTINGS ---
4747
enable_flex_start: true
48-
autoscaling_min_node_count_per_zone:
49-
autoscaling_max_node_count_per_zone:
48+
autoscaling_min_node_count:
49+
autoscaling_max_node_count:
5050

5151
# Kueue configuration
5252
kueue_configuration_path: $(ghpc_stage("./kueue-configuration.yaml.tftpl"))
@@ -200,8 +200,8 @@ deployment_groups:
200200
zones: [$(vars.zone)]
201201
enable_flex_start: $(vars.enable_flex_start)
202202
auto_repair: false # flex-start dependency
203-
autoscaling_min_node_count_per_zone: $(vars.autoscaling_min_node_count_per_zone)
204-
autoscaling_max_node_count_per_zone: $(vars.autoscaling_max_node_count_per_zone)
203+
autoscaling_min_node_count: $(vars.autoscaling_min_node_count)
204+
autoscaling_max_node_count: $(vars.autoscaling_max_node_count)
205205
additional_networks:
206206
$(concat(
207207
[{
@@ -227,7 +227,7 @@ deployment_groups:
227227
install: true
228228
config_path: $(vars.kueue_configuration_path)
229229
config_template_vars:
230-
tpu_quota: $(vars.num_slices * vars.autoscaling_max_node_count_per_zone * gke-tpu-7x-pool.tpu_chips_per_node)
230+
tpu_quota: $(vars.num_slices * vars.autoscaling_max_node_count * gke-tpu-7x-pool.tpu_chips_per_node)
231231
accelerator_type: $(gke-tpu-7x-pool.tpu_accelerator_type)
232232
jobset:
233233
install: true
@@ -270,7 +270,7 @@ deployment_groups:
270270
source: modules/compute/gke-job-template
271271
use: [checkpointing-pv, training-pv, gke-tpu-7x-pool]
272272
settings:
273-
node_count: $(vars.autoscaling_max_node_count_per_zone)
273+
node_count: $(vars.autoscaling_max_node_count)
274274
security_context: # to make sure the job has enough access to install the fio packages
275275
- key: runAsUser
276276
value: 0

examples/gke-consumption-options/dws-flex-start/gke-tpu-v6e/README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,8 @@ These steps guide you through the cluster creation process for TPUs using DWS Fl
1111
`region`: the compute region for the cluster.
1212
`zone`: the compute zone for the node pool of TPU v6e machines.
1313
**`enable_flex_start`**: set to `true` to enable DWS Flex Start.
14-
**`autoscaling_min_node_count_per_zone`**: set to `0` (required for Flex Start).
15-
**`autoscaling_max_node_count_per_zone`**: set to the required node count for your topology (e.g., `4` for a `4x4` topology).
14+
**`autoscaling_min_node_count`**: set to `0` (required for Flex Start).
15+
**`autoscaling_max_node_count`**: set to the required node count for your topology (e.g., `4` for a `4x4` topology).
1616
`authorized_cidr`: The IP address range that you want to allow to connect with the cluster.
1717
`system_node_pool_disk_size_gb`: the size of disk for each node of the system node pool.
1818
`v6e_node_pool_disk_size_gb`: the size of disk for each node of the TPU node pool.

examples/gke-consumption-options/dws-flex-start/gke-tpu-v6e/gke-tpu-v6e-deployment.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,8 +49,8 @@ vars:
4949
# num chips for the machine type (e.g. 4 for ct6e-standard-4t).
5050
# Reference: https://cloud.google.com/tpu/docs/v6e
5151
enable_flex_start: true
52-
autoscaling_min_node_count_per_zone: 0
53-
autoscaling_max_node_count_per_zone: 4
52+
autoscaling_min_node_count: 0
53+
autoscaling_max_node_count: 4
5454

5555
# CIDR block containing the IP of the machine calling Terraform.
5656
# To allow all (IAM restrictions still enforced), use 0.0.0.0/0

examples/gke-consumption-options/dws-flex-start/gke-tpu-v6e/gke-tpu-v6e.yaml

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -49,8 +49,8 @@ vars:
4949

5050
# --- FLEX START SETTINGS ---
5151
enable_flex_start: true
52-
autoscaling_min_node_count_per_zone:
53-
autoscaling_max_node_count_per_zone:
52+
autoscaling_min_node_count:
53+
autoscaling_max_node_count:
5454

5555
# Kueue configuration
5656
kueue_configuration_path: $(ghpc_stage("./kueue-configuration.yaml.tftpl"))
@@ -196,8 +196,8 @@ deployment_groups:
196196
disk_size_gb: $(vars.v6e_node_pool_disk_size_gb)
197197
enable_flex_start: $(vars.enable_flex_start)
198198
auto_repair: false # flex-start dependency
199-
autoscaling_min_node_count_per_zone: $(vars.autoscaling_min_node_count_per_zone)
200-
autoscaling_max_node_count_per_zone: $(vars.autoscaling_max_node_count_per_zone)
199+
autoscaling_min_node_count: $(vars.autoscaling_min_node_count)
200+
autoscaling_max_node_count: $(vars.autoscaling_max_node_count)
201201
additional_networks:
202202
$(concat(
203203
[{
@@ -226,7 +226,7 @@ deployment_groups:
226226
install: true
227227
config_path: $(vars.kueue_configuration_path)
228228
config_template_vars:
229-
tpu_quota: $(vars.num_slices * vars.autoscaling_max_node_count_per_zone * gke-tpu-v6-pool.tpu_chips_per_node)
229+
tpu_quota: $(vars.num_slices * vars.autoscaling_max_node_count * gke-tpu-v6-pool.tpu_chips_per_node)
230230
accelerator_type: $(gke-tpu-v6-pool.tpu_accelerator_type)
231231
jobset:
232232
install: true
@@ -269,7 +269,7 @@ deployment_groups:
269269
source: modules/compute/gke-job-template
270270
use: [checkpointing-pv, training-pv, gke-tpu-v6-pool]
271271
settings:
272-
node_count: $(vars.autoscaling_max_node_count_per_zone)
272+
node_count: $(vars.autoscaling_max_node_count)
273273
security_context: # to make sure the job has enough access to install the fio packages
274274
- key: runAsUser
275275
value: 0

modules/compute/gke-node-pool/README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -319,8 +319,8 @@ limitations under the License.
319319
| <a name="input_additional_networks"></a> [additional\_networks](#input\_additional\_networks) | Additional network interface details for GKE, if any. Providing additional networks adds additional node networks to the node pool | <pre>list(object({<br/> network = string<br/> subnetwork = string<br/> subnetwork_project = string<br/> network_ip = string<br/> nic_type = string<br/> stack_type = string<br/> queue_count = number<br/> access_config = list(object({<br/> nat_ip = string<br/> network_tier = string<br/> }))<br/> ipv6_access_config = list(object({<br/> network_tier = string<br/> }))<br/> alias_ip_range = list(object({<br/> ip_cidr_range = string<br/> subnetwork_range_name = string<br/> }))<br/> }))</pre> | `[]` | no |
320320
| <a name="input_auto_repair"></a> [auto\_repair](#input\_auto\_repair) | Whether the nodes will be automatically repaired. | `bool` | `true` | no |
321321
| <a name="input_auto_upgrade"></a> [auto\_upgrade](#input\_auto\_upgrade) | Whether the nodes will be automatically upgraded. | `bool` | `false` | no |
322-
| <a name="input_autoscaling_max_node_count_per_zone"></a> [autoscaling\_max\_node\_count\_per\_zone](#input\_autoscaling\_max\_node\_count\_per\_zone) | Maximum number of nodes per zone in the NodePool. Cannot be used with autoscaling\_total\_max\_nodes. | `number` | `null` | no |
323-
| <a name="input_autoscaling_min_node_count_per_zone"></a> [autoscaling\_min\_node\_count\_per\_zone](#input\_autoscaling\_min\_node\_count\_per\_zone) | Minimum number of nodes per zone in the NodePool. Cannot be used with autoscaling\_total\_min\_nodes. | `number` | `null` | no |
322+
| <a name="input_autoscaling_max_node_count"></a> [autoscaling\_max\_node\_count](#input\_autoscaling\_max\_node\_count) | Maximum number of nodes per zone in the NodePool. Cannot be used with autoscaling\_total\_max\_nodes. | `number` | `null` | no |
323+
| <a name="input_autoscaling_min_node_count"></a> [autoscaling\_min\_node\_count](#input\_autoscaling\_min\_node\_count) | Minimum number of nodes per zone in the NodePool. Cannot be used with autoscaling\_total\_min\_nodes. | `number` | `null` | no |
324324
| <a name="input_autoscaling_total_max_nodes"></a> [autoscaling\_total\_max\_nodes](#input\_autoscaling\_total\_max\_nodes) | Total maximum number of nodes in the NodePool. | `number` | `1000` | no |
325325
| <a name="input_autoscaling_total_min_nodes"></a> [autoscaling\_total\_min\_nodes](#input\_autoscaling\_total\_min\_nodes) | Total minimum number of nodes in the NodePool. | `number` | `0` | no |
326326
| <a name="input_cluster_id"></a> [cluster\_id](#input\_cluster\_id) | projects/{{project}}/locations/{{location}}/clusters/{{cluster}} | `string` | n/a | yes |

modules/compute/gke-node-pool/main.tf

Lines changed: 4 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -89,10 +89,10 @@ resource "google_container_node_pool" "node_pool" {
8989
dynamic "autoscaling" {
9090
for_each = local.static_node_set ? [] : [1]
9191
content {
92-
min_node_count = var.autoscaling_min_node_count_per_zone
93-
max_node_count = var.autoscaling_max_node_count_per_zone
94-
total_min_node_count = (var.autoscaling_min_node_count_per_zone != null || var.autoscaling_max_node_count_per_zone != null) ? null : var.autoscaling_total_min_nodes
95-
total_max_node_count = (var.autoscaling_min_node_count_per_zone != null || var.autoscaling_max_node_count_per_zone != null) ? null : var.autoscaling_total_max_nodes
92+
min_node_count = var.autoscaling_min_node_count
93+
max_node_count = var.autoscaling_max_node_count
94+
total_min_node_count = (var.autoscaling_min_node_count != null || var.autoscaling_max_node_count != null) ? null : var.autoscaling_total_min_nodes
95+
total_max_node_count = (var.autoscaling_min_node_count != null || var.autoscaling_max_node_count != null) ? null : var.autoscaling_total_max_nodes
9696
location_policy = "ANY"
9797
}
9898
}
@@ -406,14 +406,6 @@ resource "google_container_node_pool" "node_pool" {
406406
condition = var.spot == true ? (var.reservation_affinity.consume_reservation_type == "NO_RESERVATION") : true
407407
error_message = "Spot consumption option only works with reservation_affinity consume_reservation_type NO_RESERVATION."
408408
}
409-
precondition {
410-
condition = var.autoscaling_min_node_count_per_zone == null || (var.enable_flex_start && var.autoscaling_total_min_nodes == 0)
411-
error_message = "autoscaling_min_node_count_per_zone requires enable_flex_start to be enabled and is mutually exclusive with autoscaling_total_min_nodes. Please ensure autoscaling_total_min_nodes is not set."
412-
}
413-
precondition {
414-
condition = var.autoscaling_max_node_count_per_zone == null || (var.enable_flex_start && var.autoscaling_total_max_nodes == 1000)
415-
error_message = "autoscaling_max_node_count_per_zone requires enable_flex_start to be enabled and is mutually exclusive with autoscaling_total_max_nodes. Please ensure autoscaling_total_max_nodes is not set."
416-
}
417409
}
418410
}
419411

modules/compute/gke-node-pool/variables.tf

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -149,12 +149,22 @@ variable "autoscaling_total_min_nodes" {
149149
description = "Total minimum number of nodes in the NodePool."
150150
type = number
151151
default = 0
152+
153+
validation {
154+
condition = var.autoscaling_min_node_count == null || var.autoscaling_total_min_nodes == 0
155+
error_message = "autoscaling_total_min_nodes (global) is mutually exclusive with autoscaling_min_node_count (zonal). Please unset one of them."
156+
}
152157
}
153158

154159
variable "autoscaling_total_max_nodes" {
155160
description = "Total maximum number of nodes in the NodePool."
156161
type = number
157162
default = 1000
163+
164+
validation {
165+
condition = var.autoscaling_max_node_count == null || var.autoscaling_total_max_nodes == 1000
166+
error_message = "autoscaling_total_max_nodes (global) is mutually exclusive with autoscaling_max_node_count (zonal). Please unset one of them."
167+
}
158168
}
159169

160170
variable "static_node_count" {
@@ -486,14 +496,14 @@ variable "enable_numa_aware_scheduling" {
486496
default = false
487497
}
488498

489-
variable "autoscaling_min_node_count_per_zone" {
499+
variable "autoscaling_min_node_count" {
490500
# NOTE: This variable is currently only required for deploying TPU DWS Flex clusters
491501
description = "Minimum number of nodes per zone in the NodePool. Cannot be used with autoscaling_total_min_nodes."
492502
type = number
493503
default = null
494504
}
495505

496-
variable "autoscaling_max_node_count_per_zone" {
506+
variable "autoscaling_max_node_count" {
497507
# NOTE: This variable is currently only required for deploying TPU DWS Flex clusters
498508
description = "Maximum number of nodes per zone in the NodePool. Cannot be used with autoscaling_total_max_nodes."
499509
type = number

0 commit comments

Comments
 (0)