Revert per_zone var changes to provider defaults and add validations

shubpal07 · shubpal07 · commit e28dd7f0a6cb · 2026-01-27T11:54:16.000Z
Change-Id: I46912057ac6fb59dc1f106eaf01cc73db7a97432
diff --git a/examples/gke-consumption-options/dws-flex-start/gke-tpu-7x/README.md b/examples/gke-consumption-options/dws-flex-start/gke-tpu-7x/README.md
@@ -11,8 +11,8 @@ These steps guide you through the cluster creation process for TPUs using DWS Fl
     `region`: the compute region for the cluster.
     `zone`: the compute zone for the node pool of TPU 7x machines.
     **`enable_flex_start`**: set to `true` to enable DWS Flex Start.
-    **`autoscaling_min_node_count_per_zone`**: set to `0` (required for Flex Start).
-    **`autoscaling_max_node_count_per_zone`**: set to the required node count for your topology (e.g., `2` for a `2x2x2` topology).
+    **`autoscaling_min_node_count`**: set to `0` (required for Flex Start).
+    **`autoscaling_max_node_count`**: set to the required node count for your topology (e.g., `2` for a `2x2x2` topology).
     `authorized_cidr`: The IP address range that you want to allow to connect with the cluster.
     To modify advanced settings, edit `examples/gke-consumption-options/dws-flex-start/gke-tpu-7x/gke-tpu-7x.yaml`.
 
diff --git a/examples/gke-consumption-options/dws-flex-start/gke-tpu-7x/gke-tpu-7x-deployment.yaml b/examples/gke-consumption-options/dws-flex-start/gke-tpu-7x/gke-tpu-7x-deployment.yaml
@@ -51,8 +51,8 @@ vars:
   # num chips for the machine type (e.g. 4 for tpu7x-standard-4t).
   # Reference: https://cloud.google.com/tpu/docs/v6e
   enable_flex_start: true
-  autoscaling_min_node_count_per_zone: 0
-  autoscaling_max_node_count_per_zone: 2
+  autoscaling_min_node_count: 0
+  autoscaling_max_node_count: 2
 
   # Cidr block containing the IP of the machine calling terraform.
   # To allow all (IAM restrictions still enforced), use 0.0.0.0/0
diff --git a/examples/gke-consumption-options/dws-flex-start/gke-tpu-7x/gke-tpu-7x.yaml b/examples/gke-consumption-options/dws-flex-start/gke-tpu-7x/gke-tpu-7x.yaml
@@ -45,8 +45,8 @@ vars:
 
   # --- FLEX START SETTINGS ---
   enable_flex_start: true
-  autoscaling_min_node_count_per_zone:
-  autoscaling_max_node_count_per_zone:
+  autoscaling_min_node_count:
+  autoscaling_max_node_count:
 
 # Kueue configuration
   kueue_configuration_path: $(ghpc_stage("./kueue-configuration.yaml.tftpl"))
@@ -200,8 +200,8 @@ deployment_groups:
       zones: [$(vars.zone)]
       enable_flex_start: $(vars.enable_flex_start)
       auto_repair: false # flex-start dependency
-      autoscaling_min_node_count_per_zone: $(vars.autoscaling_min_node_count_per_zone)
-      autoscaling_max_node_count_per_zone: $(vars.autoscaling_max_node_count_per_zone)
+      autoscaling_min_node_count: $(vars.autoscaling_min_node_count)
+      autoscaling_max_node_count: $(vars.autoscaling_max_node_count)
       additional_networks:
         $(concat(
           [{
@@ -227,7 +227,7 @@ deployment_groups:
         install: true
         config_path: $(vars.kueue_configuration_path)
         config_template_vars:
-          tpu_quota: $(vars.num_slices * vars.autoscaling_max_node_count_per_zone * gke-tpu-7x-pool.tpu_chips_per_node)
+          tpu_quota: $(vars.num_slices * vars.autoscaling_max_node_count * gke-tpu-7x-pool.tpu_chips_per_node)
           accelerator_type: $(gke-tpu-7x-pool.tpu_accelerator_type)
       jobset:
         install: true
@@ -270,7 +270,7 @@ deployment_groups:
     source: modules/compute/gke-job-template
     use: [checkpointing-pv, training-pv, gke-tpu-7x-pool]
     settings:
-      node_count: $(vars.autoscaling_max_node_count_per_zone)
+      node_count: $(vars.autoscaling_max_node_count)
       security_context:  # to make sure the job have enough access to install the fio packages
       - key: runAsUser
         value: 0
diff --git a/examples/gke-consumption-options/dws-flex-start/gke-tpu-v6e/README.md b/examples/gke-consumption-options/dws-flex-start/gke-tpu-v6e/README.md
@@ -11,8 +11,8 @@ These steps guide you through the cluster creation process for TPUs using DWS Fl
     `region`: the compute region for the cluster.
     `zone`: the compute zone for the node pool of TPU v6e machines.
     **`enable_flex_start`**: set to `true` to enable DWS Flex Start.
-    **`autoscaling_min_node_count_per_zone`**: set to `0` (required for Flex Start).
-    **`autoscaling_max_node_count_per_zone`**: set to the required node count for your topology (e.g., `4` for a `4x4` topology).
+    **`autoscaling_min_node_count`**: set to `0` (required for Flex Start).
+    **`autoscaling_max_node_count`**: set to the required node count for your topology (e.g., `4` for a `4x4` topology).
     `authorized_cidr`: The IP address range that you want to allow to connect with the cluster.
     `system_node_pool_disk_size_gb`: the size of disk for each node of the system node pool.
     `v6e_node_pool_disk_size_gb`: the size of disk for each node of the TPU node pool.
diff --git a/examples/gke-consumption-options/dws-flex-start/gke-tpu-v6e/gke-tpu-v6e-deployment.yaml b/examples/gke-consumption-options/dws-flex-start/gke-tpu-v6e/gke-tpu-v6e-deployment.yaml
@@ -49,8 +49,8 @@ vars:
   # num chips for the machine type (e.g. 4 for ct6e-standard-4t).
   # Reference: https://cloud.google.com/tpu/docs/v6e
   enable_flex_start: true
-  autoscaling_min_node_count_per_zone: 0
-  autoscaling_max_node_count_per_zone: 4
+  autoscaling_min_node_count: 0
+  autoscaling_max_node_count: 4
 
   # Cidr block containing the IP of the machine calling terraform.
   # To allow all (IAM restrictions still enforced), use 0.0.0.0/0
diff --git a/examples/gke-consumption-options/dws-flex-start/gke-tpu-v6e/gke-tpu-v6e.yaml b/examples/gke-consumption-options/dws-flex-start/gke-tpu-v6e/gke-tpu-v6e.yaml
@@ -49,8 +49,8 @@ vars:
 
   # --- FLEX START SETTINGS ---
   enable_flex_start: true
-  autoscaling_min_node_count_per_zone:
-  autoscaling_max_node_count_per_zone:
+  autoscaling_min_node_count:
+  autoscaling_max_node_count:
 
  # Kueue configuration
   kueue_configuration_path: $(ghpc_stage("./kueue-configuration.yaml.tftpl"))
@@ -196,8 +196,8 @@ deployment_groups:
       disk_size_gb: $(vars.v6e_node_pool_disk_size_gb)
       enable_flex_start: $(vars.enable_flex_start)
       auto_repair: false # flex-start dependency
-      autoscaling_min_node_count_per_zone: $(vars.autoscaling_min_node_count_per_zone)
-      autoscaling_max_node_count_per_zone: $(vars.autoscaling_max_node_count_per_zone)
+      autoscaling_min_node_count: $(vars.autoscaling_min_node_count)
+      autoscaling_max_node_count: $(vars.autoscaling_max_node_count)
       additional_networks:
         $(concat(
           [{
@@ -226,7 +226,7 @@ deployment_groups:
         install: true
         config_path: $(vars.kueue_configuration_path)
         config_template_vars:
-          tpu_quota: $(vars.num_slices * vars.autoscaling_max_node_count_per_zone * gke-tpu-v6-pool.tpu_chips_per_node)
+          tpu_quota: $(vars.num_slices * vars.autoscaling_max_node_count * gke-tpu-v6-pool.tpu_chips_per_node)
           accelerator_type: $(gke-tpu-v6-pool.tpu_accelerator_type)
       jobset:
         install: true
@@ -269,7 +269,7 @@ deployment_groups:
     source: modules/compute/gke-job-template
     use: [checkpointing-pv, training-pv, gke-tpu-v6-pool]
     settings:
-      node_count: $(vars.autoscaling_max_node_count_per_zone)
+      node_count: $(vars.autoscaling_max_node_count)
       security_context:  # to make sure the job have enough access to install the fio packages
       - key: runAsUser
         value: 0
diff --git a/modules/compute/gke-node-pool/README.md b/modules/compute/gke-node-pool/README.md
@@ -319,8 +319,8 @@ limitations under the License.
 | <a name="input_additional_networks"></a> [additional\_networks](#input\_additional\_networks) | Additional network interface details for GKE, if any. Providing additional networks adds additional node networks to the node pool | <pre>list(object({<br/>    network            = string<br/>    subnetwork         = string<br/>    subnetwork_project = string<br/>    network_ip         = string<br/>    nic_type           = string<br/>    stack_type         = string<br/>    queue_count        = number<br/>    access_config = list(object({<br/>      nat_ip       = string<br/>      network_tier = string<br/>    }))<br/>    ipv6_access_config = list(object({<br/>      network_tier = string<br/>    }))<br/>    alias_ip_range = list(object({<br/>      ip_cidr_range         = string<br/>      subnetwork_range_name = string<br/>    }))<br/>  }))</pre> | `[]` | no |
 | <a name="input_auto_repair"></a> [auto\_repair](#input\_auto\_repair) | Whether the nodes will be automatically repaired. | `bool` | `true` | no |
 | <a name="input_auto_upgrade"></a> [auto\_upgrade](#input\_auto\_upgrade) | Whether the nodes will be automatically upgraded. | `bool` | `false` | no |
-| <a name="input_autoscaling_max_node_count_per_zone"></a> [autoscaling\_max\_node\_count\_per\_zone](#input\_autoscaling\_max\_node\_count\_per\_zone) | Maximum number of nodes per zone in the NodePool. Cannot be used with autoscaling\_total\_max\_nodes. | `number` | `null` | no |
-| <a name="input_autoscaling_min_node_count_per_zone"></a> [autoscaling\_min\_node\_count\_per\_zone](#input\_autoscaling\_min\_node\_count\_per\_zone) | Minimum number of nodes per zone in the NodePool. Cannot be used with autoscaling\_total\_min\_nodes. | `number` | `null` | no |
+| <a name="input_autoscaling_max_node_count"></a> [autoscaling\_max\_node\_count](#input\_autoscaling\_max\_node\_count) | Maximum number of nodes per zone in the NodePool. Cannot be used with autoscaling\_total\_max\_nodes. | `number` | `null` | no |
+| <a name="input_autoscaling_min_node_count"></a> [autoscaling\_min\_node\_count](#input\_autoscaling\_min\_node\_count) | Minimum number of nodes per zone in the NodePool. Cannot be used with autoscaling\_total\_min\_nodes. | `number` | `null` | no |
 | <a name="input_autoscaling_total_max_nodes"></a> [autoscaling\_total\_max\_nodes](#input\_autoscaling\_total\_max\_nodes) | Total maximum number of nodes in the NodePool. | `number` | `1000` | no |
 | <a name="input_autoscaling_total_min_nodes"></a> [autoscaling\_total\_min\_nodes](#input\_autoscaling\_total\_min\_nodes) | Total minimum number of nodes in the NodePool. | `number` | `0` | no |
 | <a name="input_cluster_id"></a> [cluster\_id](#input\_cluster\_id) | projects/{{project}}/locations/{{location}}/clusters/{{cluster}} | `string` | n/a | yes |
diff --git a/modules/compute/gke-node-pool/main.tf b/modules/compute/gke-node-pool/main.tf
@@ -89,10 +89,10 @@ resource "google_container_node_pool" "node_pool" {
   dynamic "autoscaling" {
     for_each = local.static_node_set ? [] : [1]
     content {
-      min_node_count       = var.autoscaling_min_node_count_per_zone
-      max_node_count       = var.autoscaling_max_node_count_per_zone
-      total_min_node_count = (var.autoscaling_min_node_count_per_zone != null || var.autoscaling_max_node_count_per_zone != null) ? null : var.autoscaling_total_min_nodes
-      total_max_node_count = (var.autoscaling_min_node_count_per_zone != null || var.autoscaling_max_node_count_per_zone != null) ? null : var.autoscaling_total_max_nodes
+      min_node_count       = var.autoscaling_min_node_count
+      max_node_count       = var.autoscaling_max_node_count
+      total_min_node_count = (var.autoscaling_min_node_count != null || var.autoscaling_max_node_count != null) ? null : var.autoscaling_total_min_nodes
+      total_max_node_count = (var.autoscaling_min_node_count != null || var.autoscaling_max_node_count != null) ? null : var.autoscaling_total_max_nodes
       location_policy      = "ANY"
     }
   }
@@ -406,14 +406,6 @@ resource "google_container_node_pool" "node_pool" {
       condition     = var.spot == true ? (var.reservation_affinity.consume_reservation_type == "NO_RESERVATION") : true
       error_message = "Spot consumption option only works with reservation_affinity consume_reservation_type NO_RESERVATION."
     }
-    precondition {
-      condition     = var.autoscaling_min_node_count_per_zone == null || (var.enable_flex_start && var.autoscaling_total_min_nodes == 0)
-      error_message = "autoscaling_min_node_count_per_zone requires enable_flex_start to be enabled and is mutually exclusive with autoscaling_total_min_nodes. Please ensure autoscaling_total_min_nodes is not set."
-    }
-    precondition {
-      condition     = var.autoscaling_max_node_count_per_zone == null || (var.enable_flex_start && var.autoscaling_total_max_nodes == 1000)
-      error_message = "autoscaling_max_node_count_per_zone requires enable_flex_start to be enabled and is mutually exclusive with autoscaling_total_max_nodes. Please ensure autoscaling_total_max_nodes is not set."
-    }
   }
 }
 
diff --git a/modules/compute/gke-node-pool/variables.tf b/modules/compute/gke-node-pool/variables.tf
@@ -149,12 +149,22 @@ variable "autoscaling_total_min_nodes" {
   description = "Total minimum number of nodes in the NodePool."
   type        = number
   default     = 0
+
+  validation {
+    condition     = var.autoscaling_min_node_count == null || var.autoscaling_total_min_nodes == 0
+    error_message = "autoscaling_total_min_nodes (global) is mutually exclusive with autoscaling_min_node_count (zonal). Please unset one of them."
+  }
 }
 
 variable "autoscaling_total_max_nodes" {
   description = "Total maximum number of nodes in the NodePool."
   type        = number
   default     = 1000
+
+  validation {
+    condition     = var.autoscaling_max_node_count == null || var.autoscaling_total_max_nodes == 1000
+    error_message = "autoscaling_total_max_nodes (global) is mutually exclusive with autoscaling_max_node_count (zonal). Please unset one of them."
+  }
 }
 
 variable "static_node_count" {
@@ -486,14 +496,14 @@ variable "enable_numa_aware_scheduling" {
   default     = false
 }
 
-variable "autoscaling_min_node_count_per_zone" {
+variable "autoscaling_min_node_count" {
   # NOTE: This variable is currently only required for deploying TPU DWS Flex clusters
   description = "Minimum number of nodes per zone in the NodePool. Cannot be used with autoscaling_total_min_nodes."
   type        = number
   default     = null
 }
 
-variable "autoscaling_max_node_count_per_zone" {
+variable "autoscaling_max_node_count" {
   # NOTE: This variable is currently only required for deploying TPU DWS Flex clusters
   description = "Maximum number of nodes per zone in the NodePool. Cannot be used with autoscaling_total_max_nodes."
   type        = number

Original file line number	Diff line number	Diff line change
`@@ -89,10 +89,10 @@ resource "google_container_node_pool" "node_pool" {`
`89`	`89`	`dynamic "autoscaling" {`
`90`	`90`	`for_each = local.static_node_set ? [] : [1]`
`91`	`91`	`content {`
`92`		`- min_node_count = var.autoscaling_min_node_count_per_zone`
`93`		`- max_node_count = var.autoscaling_max_node_count_per_zone`
`94`		`- total_min_node_count = (var.autoscaling_min_node_count_per_zone != null \|\| var.autoscaling_max_node_count_per_zone != null) ? null : var.autoscaling_total_min_nodes`
`95`		`- total_max_node_count = (var.autoscaling_min_node_count_per_zone != null \|\| var.autoscaling_max_node_count_per_zone != null) ? null : var.autoscaling_total_max_nodes`
	`92`	`+ min_node_count = var.autoscaling_min_node_count`
	`93`	`+ max_node_count = var.autoscaling_max_node_count`
	`94`	`+ total_min_node_count = (var.autoscaling_min_node_count != null \|\| var.autoscaling_max_node_count != null) ? null : var.autoscaling_total_min_nodes`
	`95`	`+ total_max_node_count = (var.autoscaling_min_node_count != null \|\| var.autoscaling_max_node_count != null) ? null : var.autoscaling_total_max_nodes`
`96`	`96`	`location_policy = "ANY"`
`97`	`97`	`}`
`98`	`98`	`}`
`@@ -406,14 +406,6 @@ resource "google_container_node_pool" "node_pool" {`
`406`	`406`	`condition = var.spot == true ? (var.reservation_affinity.consume_reservation_type == "NO_RESERVATION") : true`
`407`	`407`	`error_message = "Spot consumption option only works with reservation_affinity consume_reservation_type NO_RESERVATION."`
`408`	`408`	`}`
`409`		`- precondition {`
`410`		`- condition = var.autoscaling_min_node_count_per_zone == null \|\| (var.enable_flex_start && var.autoscaling_total_min_nodes == 0)`
`411`		`- error_message = "autoscaling_min_node_count_per_zone requires enable_flex_start to be enabled and is mutually exclusive with autoscaling_total_min_nodes. Please ensure autoscaling_total_min_nodes is not set."`
`412`		`- }`
`413`		`- precondition {`
`414`		`- condition = var.autoscaling_max_node_count_per_zone == null \|\| (var.enable_flex_start && var.autoscaling_total_max_nodes == 1000)`
`415`		`- error_message = "autoscaling_max_node_count_per_zone requires enable_flex_start to be enabled and is mutually exclusive with autoscaling_total_max_nodes. Please ensure autoscaling_total_max_nodes is not set."`
`416`		`- }`
`417`	`409`	`}`
`418`	`410`	`}`
`419`	`411`