File tree 4 files changed +10
-4
lines changed
terraform_modules/arc_v4_container_cluster
4 files changed +10
-4
lines changed Original file line number Diff line number Diff line change 1
1
githubConfigUrl : ${github_repo_url}
2
2
githubConfigSecret : github-pat
3
- minRunners : 1
3
+ minRunners : ${min_tpu_nodes}
4
4
maxRunners : ${max_tpu_nodes}
5
5
template :
6
6
spec :
Original file line number Diff line number Diff line change @@ -17,7 +17,7 @@ resource "google_container_cluster" "arc_v4_cluster" {
17
17
location = " us-central2"
18
18
19
19
remove_default_node_pool = true
20
- initial_node_count = 1
20
+ initial_node_count = var . min_tpu_nodes
21
21
22
22
release_channel {
23
23
channel = " RAPID"
@@ -53,7 +53,7 @@ resource "google_container_node_pool" "arc_v4_tpu_nodes" {
53
53
cluster = google_container_cluster. arc_v4_cluster . name
54
54
initial_node_count = 1
55
55
autoscaling {
56
- total_min_node_count = 1
56
+ total_min_node_count = var . min_tpu_nodes
57
57
total_max_node_count = var. max_tpu_nodes
58
58
location_policy = " ANY"
59
59
}
Original file line number Diff line number Diff line change @@ -18,6 +18,11 @@ variable "tpu_nodepool_name" {
18
18
type = string
19
19
}
20
20
21
+ variable "min_tpu_nodes" {
22
+ description = " Minimum number of TPU nodes and runners"
23
+ type = number
24
+ }
25
+
21
26
variable "max_tpu_nodes" {
22
27
description = " Maximum number of TPU nodes and runners"
23
28
type = number
Original file line number Diff line number Diff line change @@ -5,7 +5,8 @@ module "v4_arc_cluster" {
5
5
cpu_nodepool_name = " cpu-nodepool"
6
6
cpu_node_count = 1
7
7
tpu_nodepool_name = " tpu-nodepool"
8
- max_tpu_nodes = 2
8
+ min_tpu_nodes = 32
9
+ max_tpu_nodes = 32
9
10
github_repo_url = " https://github.com/pytorch/xla"
10
11
# Dockerfile for this image can be found at test/tpu/Dockerfile
11
12
runner_image = " gcr.io/tpu-pytorch/tpu-ci-runner:latest"
You can’t perform that action at this time.
0 commit comments