Skip to content

Commit 6a93dcf

Browse files
authored
Increase TPU CI node count from 2 to 32 (#8857)
1 parent 4190fc0 commit 6a93dcf

File tree

4 files changed

+10
-4
lines changed

4 files changed

+10
-4
lines changed

infra/terraform_modules/arc_v4_container_cluster/arc-values.yaml

+1 -1
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
githubConfigUrl: ${github_repo_url}
22
githubConfigSecret: github-pat
3-
minRunners: 1
3+
minRunners: ${min_tpu_nodes}
44
maxRunners: ${max_tpu_nodes}
55
template:
66
spec:

infra/terraform_modules/arc_v4_container_cluster/main.tf

+2 -2
Original file line number | Diff line number | Diff line change
@@ -17,7 +17,7 @@ resource "google_container_cluster" "arc_v4_cluster" {
1717
location = "us-central2"
1818

1919
remove_default_node_pool = true
20-
initial_node_count = 1
20+
initial_node_count = var.min_tpu_nodes
2121

2222
release_channel {
2323
channel = "RAPID"
@@ -53,7 +53,7 @@ resource "google_container_node_pool" "arc_v4_tpu_nodes" {
5353
cluster = google_container_cluster.arc_v4_cluster.name
5454
initial_node_count = 1
5555
autoscaling {
56-
total_min_node_count = 1
56+
total_min_node_count = var.min_tpu_nodes
5757
total_max_node_count = var.max_tpu_nodes
5858
location_policy = "ANY"
5959
}

infra/terraform_modules/arc_v4_container_cluster/variables.tf

+5
Original file line number | Diff line number | Diff line change
@@ -18,6 +18,11 @@ variable "tpu_nodepool_name" {
1818
type = string
1919
}
2020

21+
variable "min_tpu_nodes" {
22+
description = "Minimum number of TPU nodes and runners"
23+
type = number
24+
}
25+
2126
variable "max_tpu_nodes" {
2227
description = "Maximum number of TPU nodes and runners"
2328
type = number

infra/tpu-pytorch/tpu_ci.tf

+2 -1
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,8 @@ module "v4_arc_cluster" {
55
cpu_nodepool_name = "cpu-nodepool"
66
cpu_node_count = 1
77
tpu_nodepool_name = "tpu-nodepool"
8-
max_tpu_nodes = 2
8+
min_tpu_nodes = 32
9+
max_tpu_nodes = 32
910
github_repo_url = "https://github.com/pytorch/xla"
1011
# Dockerfile for this image can be found at test/tpu/Dockerfile
1112
runner_image = "gcr.io/tpu-pytorch/tpu-ci-runner:latest"

0 commit comments

Comments
 (0)