Merged
2 changes: 2 additions & 0 deletions .env.template
@@ -19,6 +19,8 @@ CLIENT_MACHINE_TYPE=
CLIENT_CLUSTER_SIZE=
# Max number of additional instances if the CPU usage is above 80%, e.g. 0
CLIENT_CLUSTER_AUTO_SCALING_MAX=
# e.g. 1
CLIENT_REGIONAL_CLUSTER_SIZE=

# This is the nomad and consul server (only for scheduling and service discovery)
# eg e2-standard-2
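For reference, a filled-in sketch of this part of the template; the values below are purely illustrative placeholders, not recommendations:

CLIENT_MACHINE_TYPE=e2-standard-4
CLIENT_CLUSTER_SIZE=2
CLIENT_CLUSTER_AUTO_SCALING_MAX=0
CLIENT_REGIONAL_CLUSTER_SIZE=1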
61 changes: 32 additions & 29 deletions Makefile
@@ -5,36 +5,39 @@ ENV_FILE := $(PWD)/.env.${ENV}

TF := $(shell which terraform)
TERRAFORM_STATE_BUCKET ?= $(GCP_PROJECT_ID)-terraform-state
OTEL_TRACING_PRINT ?= false
TEMPLATE_BUCKET_LOCATION ?= $(GCP_REGION)
CLIENT_CLUSTER_AUTO_SCALING_MAX ?= 0
REDIS_MANAGED ?= false
GRAFANA_MANAGED ?= false

tf_vars := TF_VAR_client_machine_type=$(CLIENT_MACHINE_TYPE) \
TF_VAR_client_cluster_size=$(CLIENT_CLUSTER_SIZE) \
TF_VAR_client_cluster_auto_scaling_max=$(CLIENT_CLUSTER_AUTO_SCALING_MAX) \
TF_VAR_api_machine_type=$(API_MACHINE_TYPE) \
TF_VAR_api_cluster_size=$(API_CLUSTER_SIZE) \
TF_VAR_build_machine_type=$(BUILD_MACHINE_TYPE) \
TF_VAR_build_cluster_size=$(BUILD_CLUSTER_SIZE) \
TF_VAR_server_machine_type=$(SERVER_MACHINE_TYPE) \
TF_VAR_server_cluster_size=$(SERVER_CLUSTER_SIZE) \
TF_VAR_clickhouse_cluster_size=$(CLICKHOUSE_CLUSTER_SIZE) \
TF_VAR_clickhouse_machine_type=$(CLICKHOUSE_MACHINE_TYPE) \
TF_VAR_gcp_project_id=$(GCP_PROJECT_ID) \
TF_VAR_gcp_region=$(GCP_REGION) \
TF_VAR_gcp_zone=$(GCP_ZONE) \
TF_VAR_domain_name=$(DOMAIN_NAME) \
TF_VAR_additional_domains=$(ADDITIONAL_DOMAINS) \
TF_VAR_prefix=$(PREFIX) \
TF_VAR_terraform_state_bucket=$(TERRAFORM_STATE_BUCKET) \
TF_VAR_otel_tracing_print=$(OTEL_TRACING_PRINT) \
TF_VAR_environment=$(TERRAFORM_ENVIRONMENT) \
TF_VAR_template_bucket_name=$(TEMPLATE_BUCKET_NAME) \
TF_VAR_template_bucket_location=$(TEMPLATE_BUCKET_LOCATION) \
TF_VAR_redis_managed=$(REDIS_MANAGED) \
TF_VAR_grafana_managed=$(GRAFANA_MANAGED)

# Emit a TF_VAR_* assignment only if the corresponding environment variable is set.
# The macro strips the passed variable name (the call is whitespace-sensitive); if the variable
# has a value it expands to TF_VAR_<variable_name>=<value> with the name lower-cased, otherwise to nothing.
define tfvar
$(if $(value $(strip $(1))), TF_VAR_$(shell echo $(strip $(1)) | tr A-Z a-z)=$($(strip $(1))))
endef

tf_vars := TF_VAR_environment=$(TERRAFORM_ENVIRONMENT) \
$(call tfvar, CLIENT_MACHINE_TYPE) \
$(call tfvar, CLIENT_CLUSTER_SIZE) \
$(call tfvar, CLIENT_REGIONAL_CLUSTER_SIZE) \
$(call tfvar, CLIENT_CLUSTER_AUTO_SCALING_MAX) \
$(call tfvar, API_MACHINE_TYPE) \
$(call tfvar, API_CLUSTER_SIZE) \
$(call tfvar, BUILD_MACHINE_TYPE) \
$(call tfvar, BUILD_CLUSTER_SIZE) \
$(call tfvar, SERVER_MACHINE_TYPE) \
$(call tfvar, SERVER_CLUSTER_SIZE) \
$(call tfvar, CLICKHOUSE_CLUSTER_SIZE) \
$(call tfvar, CLICKHOUSE_MACHINE_TYPE) \
$(call tfvar, GCP_PROJECT_ID) \
$(call tfvar, GCP_REGION) \
$(call tfvar, GCP_ZONE) \
$(call tfvar, DOMAIN_NAME) \
$(call tfvar, ADDITIONAL_DOMAINS) \
$(call tfvar, PREFIX) \
$(call tfvar, TERRAFORM_STATE_BUCKET) \
$(call tfvar, OTEL_TRACING_PRINT) \
$(call tfvar, TEMPLATE_BUCKET_NAME) \
$(call tfvar, TEMPLATE_BUCKET_LOCATION) \
$(call tfvar, REDIS_MANAGED) \
$(call tfvar, GRAFANA_MANAGED)

# Login for Packer and Docker (uses gcloud user creds)
# Login for Terraform (uses application default creds)
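For context on the Makefile change above: the new tfvar helper only passes a TF_VAR_* assignment to Terraform when the corresponding environment variable is actually set. A minimal sketch of how a single call expands (the example_tf_var name is purely illustrative and not part of the change):

example_tf_var := $(call tfvar, CLIENT_CLUSTER_SIZE)
# With CLIENT_CLUSTER_SIZE=3 set in the included .env file, example_tf_var holds " TF_VAR_client_cluster_size=3".
# With CLIENT_CLUSTER_SIZE unset or empty, example_tf_var is empty, so no TF_VAR_client_cluster_size is passed.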
3 changes: 1 addition & 2 deletions main.tf
@@ -91,6 +91,7 @@ module "cluster" {

server_cluster_size = var.server_cluster_size
client_cluster_size = var.client_cluster_size
client_regional_cluster_size = var.client_regional_cluster_size
client_cluster_auto_scaling_max = var.client_cluster_auto_scaling_max
api_cluster_size = var.api_cluster_size
build_cluster_size = var.build_cluster_size
@@ -127,8 +128,6 @@ module "cluster" {
consul_acl_token_secret = module.init.consul_acl_token_secret
nomad_acl_token_secret = module.init.nomad_acl_token_secret

notification_email_secret_version = module.init.notification_email_secret_version

labels = var.labels
prefix = var.prefix
}
68 changes: 64 additions & 4 deletions packages/cluster/client/main.tf
@@ -77,6 +77,70 @@ resource "google_compute_instance_group_manager" "client_cluster" {
]
}

resource "google_compute_region_autoscaler" "client" {
provider = google-beta

name = "${var.cluster_name}-client-autoscaler"
region = var.gcp_region
target = google_compute_region_instance_group_manager.client_cluster.id

autoscaling_policy {
max_replicas = var.regional_cluster_size + var.cluster_auto_scaling_max
min_replicas = var.regional_cluster_size
cooldown_period = 240
mode = "ONLY_SCALE_OUT"

cpu_utilization {
target = 0.6
}
}
}

resource "google_compute_region_instance_group_manager" "client_cluster" {
name = "${var.cluster_name}-rig"
region = var.gcp_region

version {
name = google_compute_instance_template.client.id
instance_template = google_compute_instance_template.client.id
}

named_port {
name = var.logs_health_proxy_port.name
port = var.logs_health_proxy_port.port
}

named_port {
name = var.logs_proxy_port.name
port = var.logs_proxy_port.port
}

auto_healing_policies {
health_check = google_compute_health_check.nomad_check.id
initial_delay_sec = 600
}

# The client cluster is stateful, so the update strategy used to roll out a new GCE Instance Template must be
# a rolling update.
update_policy {
type = var.environment == "dev" ? "PROACTIVE" : "OPPORTUNISTIC"
minimal_action = var.instance_group_update_policy_minimal_action
max_surge_fixed = var.instance_group_update_policy_max_surge_fixed
max_surge_percent = var.instance_group_update_policy_max_surge_percent
max_unavailable_fixed = var.instance_group_update_policy_max_unavailable_fixed
max_unavailable_percent = var.instance_group_update_policy_max_unavailable_percent
replacement_method = "SUBSTITUTE"
}

base_instance_name = var.cluster_name
target_pools = var.instance_group_target_pools

depends_on = [
google_compute_instance_template.client,
]
}


data "google_compute_image" "source_image" {
family = var.image_family
}
@@ -151,9 +215,5 @@ resource "google_compute_instance_template" "client" {
# which this Terraform resource depends will also need this lifecycle statement.
lifecycle {
create_before_destroy = true

# TODO: Temporary workaround to avoid unnecessary updates to the instance template.
# This should be removed once cluster size is removed from the metadata
ignore_changes = [metadata]
}
}
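To make the new sizing knobs concrete: the regional autoscaler's bounds are derived from the two cluster-size variables wired through from the root module. A minimal sketch, assuming hypothetical root-level values (both numbers are placeholders, not recommendations):

# Hypothetical terraform.tfvars values, for illustration only
client_regional_cluster_size    = 1
client_cluster_auto_scaling_max = 2

# With these values the regional autoscaler above runs with:
#   min_replicas = 1           (regional_cluster_size)
#   max_replicas = 1 + 2 = 3   (regional_cluster_size + cluster_auto_scaling_max)
# and, because mode is "ONLY_SCALE_OUT", it only ever adds instances when average
# CPU utilization exceeds the 0.6 target; it never removes them automatically.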
16 changes: 4 additions & 12 deletions packages/cluster/client/outputs.tf
@@ -6,18 +6,10 @@ output "cluster_tag_name" {
value = var.cluster_name
}

output "instance_group_id" {
value = google_compute_instance_group_manager.client_cluster.id
}

output "instance_group_url" {
value = google_compute_instance_group_manager.client_cluster.self_link
}

output "instance_group_name" {
value = google_compute_instance_group_manager.client_cluster.name
}

output "instance_group" {
value = google_compute_instance_group_manager.client_cluster.instance_group
}

output "regional_instance_group" {
value = google_compute_region_instance_group_manager.client_cluster.instance_group
}
12 changes: 11 additions & 1 deletion packages/cluster/client/variables.tf
@@ -8,6 +8,11 @@ variable "environment" {
type = string
}

variable "gcp_region" {
description = "The GCP region in which the server cluster will be created (e.g. us-central1-a)."
type = string
}

variable "gcp_zone" {
description = "The GCP zone in which the server cluster will be created (e.g. us-central1-a)."
type = string
@@ -24,7 +29,12 @@ variable "machine_type" {
}

variable "cluster_size" {
description = "The number of nodes to have in the Nomad cluster. We strongly recommended that you use either 3 or 5."
description = "The number of nodes to have in the client cluster."
type = number
}

variable "regional_cluster_size" {
description = "The number of nodes to have in the regional cluster."
type = number
}

12 changes: 8 additions & 4 deletions packages/cluster/main.tf
@@ -138,8 +138,11 @@ module "client_cluster" {
cluster_name = "${var.prefix}${var.client_cluster_name}"
cluster_auto_scaling_max = var.client_cluster_auto_scaling_max
cluster_size = var.client_cluster_size
regional_cluster_size = var.client_regional_cluster_size
cluster_tag_name = var.cluster_tag_name
gcp_zone = var.gcp_zone

gcp_region = var.gcp_region
gcp_zone = var.gcp_zone

machine_type = var.client_machine_type
image_family = var.client_image_family
@@ -302,9 +305,10 @@ module "network" {
domain_name = var.domain_name
additional_domains = var.additional_domains

client_instance_group = module.client_cluster.instance_group
client_proxy_port = var.client_proxy_port
client_proxy_health_port = var.client_proxy_health_port
client_instance_group = module.client_cluster.instance_group
client_regional_instance_group = module.client_cluster.regional_instance_group
client_proxy_port = var.client_proxy_port
client_proxy_health_port = var.client_proxy_health_port

api_instance_group = module.api_cluster.instance_group
build_instance_group = module.build_cluster.instance_group
13 changes: 13 additions & 0 deletions packages/cluster/network/main.tf
@@ -483,6 +483,19 @@ module "gce_lb_http_logs" {
max_rate_per_endpoint = null
max_utilization = null
},
{
group = var.client_regional_instance_group
balancing_mode = null
capacity_scaler = null
description = null
max_connections = null
max_connections_per_instance = null
max_connections_per_endpoint = null
max_rate = null
max_rate_per_instance = null
max_rate_per_endpoint = null
max_utilization = null
},
]

iap_config = {
4 changes: 4 additions & 0 deletions packages/cluster/network/variables.tf
@@ -94,6 +94,10 @@ variable "client_instance_group" {
type = string
}

variable "client_regional_instance_group" {
type = string
}

variable "server_instance_group" {
type = string
}
8 changes: 3 additions & 5 deletions packages/cluster/variables.tf
@@ -7,11 +7,6 @@ variable "environment" {
type = string
}

variable "notification_email_secret_version" {
# we're just using this variable to propagate the whole dependency
type = any
}

variable "cloudflare_api_token_secret_name" {
type = string
}
@@ -111,6 +106,9 @@ variable "client_cluster_size" {
type = number
}

variable "client_regional_cluster_size" {
type = number
}

variable "client_cluster_auto_scaling_max" {
type = number
7 changes: 0 additions & 7 deletions packages/nomad/main.tf
@@ -303,12 +303,6 @@ data "google_storage_bucket_object" "orchestrator" {
bucket = var.fc_env_pipeline_bucket_name
}


data "google_compute_machine_types" "client" {
zone = var.gcp_zone
filter = "name = \"${var.client_machine_type}\""
}

data "external" "orchestrator_checksum" {
program = ["bash", "${path.module}/checksum.sh"]

@@ -320,7 +314,6 @@ data "external" "orchestrator_checksum" {

locals {
orchestrator_envs = {
gcp_zone = var.gcp_zone
port = var.orchestrator_port
proxy_port = var.orchestrator_proxy_port
environment = var.environment
2 changes: 1 addition & 1 deletion packages/nomad/orchestrator.hcl
@@ -47,7 +47,7 @@ if [ "{{with nomadVar "nomad/jobs" }}{{ .latest_orchestrator_job_id }}{{ end }}"
fi
EOT
}

config {
command = "local/check-placement.sh"
}
20 changes: 0 additions & 20 deletions packages/orchestrator/Makefile
@@ -1,8 +1,6 @@
ENV := $(shell cat ../../.last_used_env || echo "not-set")
-include ../../.env.${ENV}

client := gcloud compute instances list --format='csv(name)' --project $(GCP_PROJECT_ID) | grep "client"

.PHONY: init
init:
brew install protobuf
@@ -70,24 +68,6 @@ mock:
mock-nbd:
sudo go run -gcflags=all="-N -l" cmd/mock-nbd/mock.go

.PHONY: killall
killall:
gcloud compute instance-groups list-instances $(PREFIX)orch-client-ig \
--zone=$(GCP_ZONE) \
--project=$(GCP_PROJECT_ID) \
--format="value(instance)" \
--quiet | xargs -I {} -P 5 sh -c "gcloud compute ssh {} --project=$(GCP_PROJECT_ID) --zone=$(GCP_ZONE) --command='sudo killall -9 firecracker'"
@echo "Killing all firecracker processes"

.PHONY: kill-old
kill-old:
gcloud compute instance-groups list-instances $(PREFIX)orch-client-ig \
--zone=$(GCP_ZONE) \
--project=$(GCP_PROJECT_ID) \
--format="value(instance)" \
--quiet | xargs -I {} -P 5 sh -c "gcloud compute ssh {} --project=$(GCP_PROJECT_ID) --zone=$(GCP_ZONE) --command='sudo killall -9 --older-than 24h firecracker'"
@echo "Killing all firecracker processes"

.PHONY: mock-snapshot
mock-snapshot:
sudo TEMPLATE_BUCKET_NAME=$(TEMPLATE_BUCKET_NAME) CONSUL_TOKEN=$(CONSUL_TOKEN) NODE_ID="testclient" go run cmd/mock-snapshot/mock.go -template 5wzg6c91u51yaebviysf -build "f0370054-b669-4d7e-b33b-573d5287c6ef" -alive 1 -count 1
7 changes: 4 additions & 3 deletions tests/integration/Makefile
@@ -54,7 +54,8 @@ connect-orchestrator:
CLIENT_IG=$$(gcloud compute instance-groups list \
--project=$(GCP_PROJECT_ID) \
--filter="name~'^.*client.*'" \
--format='value(name)' \
--zones=$(GCP_ZONE) | head -n1) && \
INSTANCE_ID=$$(gcloud compute instance-groups list-instances "$$CLIENT_IG" --project=$(GCP_PROJECT_ID) --zone=$(GCP_ZONE) --format='value(instance)' | head -n1) && \
--format='value(name)' | head -n1) && \
INSTANCE_INFO=$$(gcloud compute instance-groups list-instances "$$CLIENT_IG" --project=$(GCP_PROJECT_ID) --format='value(instance,zone)' | head -n1) && \
INSTANCE_ID=$$(echo "$$INSTANCE_INFO" | awk '{print $$1}') && \
INSTANCE_ZONE=$$(echo "$$INSTANCE_INFO" | awk '{print $$2}') && \
gcloud compute ssh "$$INSTANCE_ID" --project=$(GCP_PROJECT_ID) --zone=$(GCP_ZONE) -- -NL 5008:localhost:5008 -o PermitLocalCommand=yes -o LocalCommand="echo 'SSH tunnel established'"