Skip to content

Commit cd3ebc7

Browse files
Merge branch 'master' into dvoros-test
2 parents 7750b71 + b1b8869 commit cd3ebc7

File tree

8 files changed

+439
-0
lines changed

8 files changed

+439
-0
lines changed
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
# GKE and CAST AI example with connecting via GCP Private Service Connect
2+
3+
The following example shows how to onboard a GKE cluster to CAST AI and configure [Autoscaler policies](https://docs.cast.ai/reference/policiesapi_upsertclusterpolicies) and additional [Node Configurations](https://docs.cast.ai/docs/node-configuration/).
4+
5+
The IAM policies required to connect the cluster to CAST AI in this example are created by the [castai/gke-iam/castai module](https://github.com/castai/terraform-castai-gke-iam).
6+
7+
Example configuration should be analysed in the following order:
8+
1. Create VPC - `vpc.tf`
9+
2. Private Service Connect endpoint and private DNS zone - `private-service-connect.tf`
10+
3. Create GKE cluster - `gke.tf`
11+
4. Create IAM and other CAST AI related resources to connect GKE cluster to CAST AI, configure Autoscaler and Node Configurations - `castai.tf`
12+
13+
## What is unique about this example
14+
15+
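This example connects the GKE cluster to CAST AI through GCP Private Service Connect: a PSC endpoint and a private DNS zone are created inside the VPC (see `private-service-connect.tf`).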
16+
17+
## Usage
18+
1. Rename `tf.vars.example` to `tf.vars`
19+
2. Update the necessary variables in the `tf.vars` file.
20+
3. Initialize Terraform. Under example root folder run:
21+
```
22+
terraform init
23+
```
24+
4. Run Terraform apply:
25+
```
26+
terraform apply -var-file=tf.vars
27+
```
28+
5. To destroy resources created by this example:
29+
```
30+
terraform destroy -var-file=tf.vars
31+
```
32+
33+
Please refer to this guide if you run into any issues: https://docs.cast.ai/docs/terraform-troubleshooting
Lines changed: 184 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,184 @@
1+
# 5. Connect GKE cluster to CAST AI
2+
3+
# Configure Data sources and providers required for CAST AI connection.
4+
5+
data "google_client_config" "default" {}
6+
7+
provider "castai" {
8+
api_url = var.castai_public_api_url
9+
api_token = var.castai_api_token
10+
}
11+
12+
provider "helm" {
13+
kubernetes = {
14+
host = "https://${module.gke.endpoint}"
15+
token = data.google_client_config.default.access_token
16+
cluster_ca_certificate = base64decode(module.gke.ca_certificate)
17+
}
18+
}
19+
20+
# Configure GKE cluster connection using CAST AI gke-cluster module.
21+
module "castai-gke-iam" {
22+
source = "castai/gke-iam/castai"
23+
version = "~> 0.5"
24+
25+
project_id = var.project_id
26+
gke_cluster_name = var.cluster_name
27+
}
28+
29+
module "castai-gke-cluster" {
30+
source = "castai/gke-cluster/castai"
31+
version = "~> 9.1"
32+
33+
api_url = var.castai_public_api_url
34+
castware_api_url = "https://api.psc.${var.castai_api_private_domain}"
35+
castai_api_token = var.castai_api_token
36+
grpc_url = "grpc.psc.${var.castai_api_private_domain}:443"
37+
wait_for_cluster_ready = true
38+
39+
project_id = var.project_id
40+
gke_cluster_name = var.cluster_name
41+
gke_cluster_location = module.gke.location
42+
43+
gke_credentials = module.castai-gke-iam.private_key
44+
delete_nodes_on_disconnect = var.delete_nodes_on_disconnect
45+
46+
default_node_configuration_name = "default"
47+
48+
node_configurations = {
49+
default = {
50+
disk_cpu_ratio = 25
51+
subnets = [module.vpc.subnets_ids[0]]
52+
tags = var.tags
53+
}
54+
55+
# Extra node configuration demonstrating non-default settings (pod density, disk type, network tags).
test_node_config = {
56+
disk_cpu_ratio = 10
57+
subnets = [module.vpc.subnets_ids[0]]
58+
tags = var.tags
59+
max_pods_per_node = 40
60+
disk_type = "pd-ssd"
61+
network_tags = ["dev"]
62+
}
63+
64+
}
65+
66+
node_templates = {
67+
default_by_castai = {
68+
name = "default-by-castai"
69+
configuration_name = "default"
70+
is_default = true
71+
is_enabled = true
72+
should_taint = false
73+
74+
constraints = {
75+
on_demand = true
76+
spot = true
77+
use_spot_fallbacks = true
78+
79+
enable_spot_diversity = false
80+
spot_diversity_price_increase_limit_percent = 20
81+
}
82+
}
83+
84+
spot_tmpl = {
85+
configuration_id = module.castai-gke-cluster.castai_node_configurations["default"]
86+
is_enabled = true
87+
should_taint = true
88+
89+
custom_labels = {
90+
custom-label-key-1 = "custom-label-value-1"
91+
custom-label-key-2 = "custom-label-value-2"
92+
}
93+
94+
custom_taints = [
95+
{
96+
key = "custom-taint-key-1"
97+
value = "custom-taint-value-1"
98+
effect = "NoSchedule"
99+
},
100+
{
101+
key = "custom-taint-key-2"
102+
value = "custom-taint-value-2"
103+
effect = "NoSchedule"
104+
}
105+
]
106+
107+
constraints = {
108+
fallback_restore_rate_seconds = 1800
109+
spot = true
110+
use_spot_fallbacks = true
111+
min_cpu = 4
112+
max_cpu = 100
113+
instance_families = {
114+
exclude = ["e2"]
115+
}
116+
compute_optimized_state = "disabled"
117+
storage_optimized_state = "disabled"
118+
# Optional: define custom priority for instances selection.
119+
#
120+
# 1. Prioritize C2D and C2 spot instances above all else, regardless of price.
121+
# 2. If C2D and C2 is not available, try C3D family.
122+
custom_priority = [
123+
{
124+
instance_families = ["c2d", "c2"]
125+
spot = true
126+
},
127+
{
128+
instance_families = ["c3d"]
129+
spot = true
130+
}
131+
# 3. instances not matching any of custom priority groups will be tried after
132+
# nothing matches from priority groups.
133+
]
134+
}
135+
custom_instances_enabled = true
136+
}
137+
}
138+
139+
autoscaler_settings = {
140+
enabled = true
141+
node_templates_partial_matching_enabled = false
142+
143+
unschedulable_pods = {
144+
enabled = true
145+
}
146+
147+
node_downscaler = {
148+
enabled = true
149+
150+
empty_nodes = {
151+
enabled = true
152+
}
153+
154+
evictor = {
155+
aggressive_mode = false
156+
cycle_interval = "5m10s"
157+
dry_run = false
158+
enabled = true
159+
node_grace_period_minutes = 10
160+
scoped_mode = false
161+
}
162+
}
163+
164+
cluster_limits = {
165+
enabled = true
166+
167+
cpu = {
168+
max_cores = 20
169+
min_cores = 1
170+
}
171+
}
172+
}
173+
174+
depends_on = [
175+
// depends_on helps terraform with creating proper dependencies graph in case of resource creation and in this case destroy
176+
// module "castai-gke-cluster" has to be destroyed before module "castai-gke-iam" and "module.gke"
177+
module.gke,
178+
module.castai-gke-iam,
179+
// DNS record must be created before onboarding to Cast AI
180+
google_dns_record_set.a,
181+
// Private Service Connect Endpoint must be created before onboarding to Cast AI
182+
google_compute_forwarding_rule.cast_ai_private_api
183+
]
184+
}
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
# 4. Create GKE cluster.
2+
3+
module "gke" {
4+
source = "terraform-google-modules/kubernetes-engine/google"
5+
version = "33.1.0"
6+
project_id = var.project_id
7+
name = var.cluster_name
8+
region = var.cluster_region
9+
zones = var.cluster_zones
10+
network = module.vpc.network_name
11+
subnetwork = module.vpc.subnets_names[0]
12+
ip_range_pods = local.ip_range_pods
13+
ip_range_services = local.ip_range_services
14+
http_load_balancing = false
15+
network_policy = false
16+
horizontal_pod_autoscaling = true
17+
filestore_csi_driver = false
18+
19+
node_pools = [
20+
{
21+
name = "default-node-pool"
22+
machine_type = "e2-standard-2"
23+
min_count = 0
24+
max_count = 10
25+
local_ssd_count = 0
26+
disk_size_gb = 100
27+
disk_type = "pd-standard"
28+
image_type = "COS_CONTAINERD"
29+
auto_repair = true
30+
auto_upgrade = true
31+
preemptible = false
32+
initial_node_count = 2 # has to be >=2 to successfully deploy CAST AI controller
33+
},
34+
]
35+
36+
deletion_protection = false
37+
}
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
# 2. PSC Endpoint setup
2+
3+
# Reserves a static internal IP address for the Private Service Connect endpoint.
# The address lives in the VPC subnet at index 1; it is consumed by the forwarding
# rule and by the wildcard DNS A record defined further down in this file.
resource "google_compute_address" "cast_ai_private_api" {
4+
project = var.project_id
5+
name = "cast-ai-private-api"
6+
region = module.vpc.subnets_regions[1]
7+
address_type = "INTERNAL"
8+
subnetwork = module.vpc.subnets_self_links[1]
9+
# cidrhost(prefix, 2) picks host number 2 within the subnet's CIDR range.
address = cidrhost(module.vpc.subnets_ips[1], 2)
10+
}
11+
12+
# Private Service Connect endpoint: a forwarding rule that sends traffic arriving at the
# reserved internal address to the service attachment given by var.cast_api_service_attachment_uri.
resource "google_compute_forwarding_rule" "cast_ai_private_api" {
13+
project = var.project_id
14+
name = "cast-ai-private-api"
15+
target = var.cast_api_service_attachment_uri
16+
network = module.vpc.network_id
17+
region = module.vpc.subnets_regions[1]
18+
ip_address = google_compute_address.cast_ai_private_api.id
19+
# NOTE(review): explicitly empty, presumably because the target is a service attachment
# rather than a load balancer backend - confirm against the Google provider documentation.
load_balancing_scheme = ""
20+
allow_psc_global_access = var.allow_psc_global_access
21+
}
22+
23+
24+
# 3. DNS setup
25+
26+
# Private Cloud DNS zone for var.castai_api_private_domain.
# Visibility is "private" and limited to the example's VPC network.
resource "google_dns_managed_zone" "psc_zone" {
27+
name = "cast-ai-psc-zone"
28+
project = var.project_id
29+
# The trailing dot makes the zone name fully qualified.
dns_name = "${var.castai_api_private_domain}."
30+
description = "Cast AI Private Service Connect zone"
31+
32+
visibility = "private"
33+
34+
private_visibility_config {
35+
networks {
36+
network_url = module.vpc.network_id
37+
}
38+
}
39+
}
40+
41+
# Wildcard A record "*.psc.<private domain>" in the private zone, pointing at the reserved
# PSC endpoint address. Hostnames such as api.psc.<domain> and grpc.psc.<domain>, which are
# passed to the castai-gke-cluster module, therefore resolve to the endpoint.
resource "google_dns_record_set" "a" {
42+
name = "*.psc.${google_dns_managed_zone.psc_zone.dns_name}"
43+
project = var.project_id
44+
managed_zone = google_dns_managed_zone.psc_zone.name
45+
type = "A"
46+
ttl = 300
47+
48+
rrdatas = [google_compute_address.cast_ai_private_api.address]
49+
}
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
project_id = "<place-holder>"
2+
cluster_name = "<place-holder>"
3+
cluster_region = "<place-holder>"
4+
cluster_zones = ["<place-holder>", "<place-holder>"]
5+
castai_public_api_url = "https://api.prod-master.cast.ai"
6+
castai_api_private_domain = "prod-master.cast.ai"
7+
castai_api_token = "<place-holder>"
8+
cast_api_service_attachment_uri = "<place-holder>"
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
# GKE module variables.
2+
variable "cluster_name" {
3+
type = string
4+
description = "GKE cluster name in GCP project."
5+
}
6+
7+
variable "cluster_region" {
8+
type = string
9+
description = "The region to create the cluster."
10+
}
11+
12+
variable "cluster_zones" {
13+
type = list(string)
14+
description = "The zones to create the cluster."
15+
default = []
16+
}
17+
18+
variable "project_id" {
19+
type = string
20+
description = "GCP project ID in which GKE cluster would be created."
21+
}
22+
23+
variable "castai_public_api_url" {
24+
type = string
25+
description = "URL of public CAST AI API"
26+
default = "https://api.cast.ai"
27+
}
28+
29+
variable "castai_api_token" {
30+
type = string
31+
description = "CAST AI API token created in console.cast.ai API Access keys section."
32+
}
33+
34+
variable "castai_api_private_domain" {
35+
type = string
36+
description = "Private domain used to access Cast AI via Private Service Connect"
37+
default = "prod-master.cast.ai"
38+
}
39+
40+
variable "cast_api_service_attachment_uri" {
41+
type = string
42+
description = "Service Attachment URI to connect to."
43+
default = "projects/prod-master-scl0/regions/us-east4/serviceAttachments/castware-psc"
44+
}
45+
46+
variable "allow_psc_global_access" {
47+
type = bool
48+
description = "Allow global access to the Private Service Connect Endpoint. If set to false, the cluster must be in the same region as the Service Attachment."
49+
default = true
50+
}
51+
52+
variable "delete_nodes_on_disconnect" {
53+
type = bool
54+
description = "Optional parameter, if set to true - CAST AI provisioned nodes will be deleted from cloud on cluster disconnection. For production use it is recommended to set it to false."
55+
default = true
56+
}
57+
58+
variable "tags" {
59+
type = map(any)
60+
description = "Optional tags for new cluster nodes. This parameter applies only to new nodes - tags for old nodes are not reconciled."
61+
default = {}
62+
}

0 commit comments

Comments
 (0)