Skip to content

Commit ec66fbd

Browse files
Orchestrator rolling updates with job definition (#564)
Co-authored-by: Tomas Valenta <[email protected]>
1 parent b573607 commit ec66fbd

File tree

4 files changed

+88
-13
lines changed

4 files changed

+88
-13
lines changed

packages/nomad/main.tf

Lines changed: 42 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -317,8 +317,9 @@ data "external" "orchestrator_checksum" {
317317
}
318318
}
319319

320-
resource "nomad_job" "orchestrator" {
321-
jobspec = templatefile("${path.module}/orchestrator.hcl", {
320+
321+
locals {
322+
orchestrator_envs = {
322323
gcp_zone = var.gcp_zone
323324
port = var.orchestrator_port
324325
proxy_port = var.orchestrator_proxy_port
@@ -336,7 +337,45 @@ resource "nomad_job" "orchestrator" {
336337
clickhouse_username = var.clickhouse_username
337338
clickhouse_password = var.clickhouse_password
338339
clickhouse_database = var.clickhouse_database
339-
})
340+
}
341+
342+
orchestrator_job_check = templatefile("${path.module}/orchestrator.hcl", merge(
343+
local.orchestrator_envs,
344+
{
345+
latest_orchestrator_job_id = "placeholder",
346+
}
347+
))
348+
}
349+
350+
351+
352+
resource "random_id" "orchestrator_job" {
353+
keepers = {
354+
# Use both the orchestrator job (including vars) definition and the latest orchestrator checksum to detect changes
355+
orchestrator_job = sha256("${local.orchestrator_job_check}-${data.external.orchestrator_checksum.result.hex}")
356+
}
357+
358+
byte_length = 8
359+
}
360+
361+
resource "nomad_variable" "orchestrator_hash" {
362+
path = "nomad/jobs"
363+
items = {
364+
latest_orchestrator_job_id = random_id.orchestrator_job.hex
365+
}
366+
}
367+
368+
resource "nomad_job" "orchestrator" {
369+
deregister_on_id_change = false
370+
371+
jobspec = templatefile("${path.module}/orchestrator.hcl", merge(
372+
local.orchestrator_envs,
373+
{
374+
latest_orchestrator_job_id = random_id.orchestrator_job.hex
375+
}
376+
))
377+
378+
depends_on = [nomad_variable.orchestrator_hash]
340379
}
341380

342381
data "google_storage_bucket_object" "template_manager" {

packages/nomad/orchestrator.hcl

Lines changed: 36 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,10 @@
1-
job "orchestrator" {
1+
job "orchestrator-${latest_orchestrator_job_id}" {
22
type = "system"
3-
datacenters = ["${gcp_zone}"]
3+
node_pool = "default"
44

55
priority = 90
66

77
group "client-orchestrator" {
8-
network {
9-
port "orchestrator" {
10-
static = "${port}"
11-
}
12-
}
13-
148
service {
159
name = "orchestrator"
1610
port = "${port}"
@@ -30,9 +24,42 @@ job "orchestrator" {
3024
port = "${proxy_port}"
3125
}
3226

27+
task "check-placement" {
28+
driver = "raw_exec"
29+
30+
lifecycle {
31+
hook = "prestart"
32+
sidecar = false
33+
}
34+
35+
restart {
36+
attempts = 0
37+
}
38+
39+
template {
40+
destination = "local/check-placement.sh"
41+
data = <<EOT
42+
#!/bin/bash
43+
44+
if [ "{{with nomadVar "nomad/jobs" }}{{ .latest_orchestrator_job_id }}{{ end }}" != "${latest_orchestrator_job_id}" ]; then
45+
echo "This orchestrator is not the latest version, exiting"
46+
exit 1
47+
fi
48+
EOT
49+
}
50+
51+
config {
52+
command = "local/check-placement.sh"
53+
}
54+
}
55+
3356
task "start" {
3457
driver = "raw_exec"
3558

59+
restart {
60+
attempts = 0
61+
}
62+
3663
env {
3764
NODE_ID = "$${node.unique.name}"
3865
CONSUL_TOKEN = "${consul_acl_token}"
@@ -50,7 +77,7 @@ job "orchestrator" {
5077

5178
config {
5279
command = "/bin/bash"
53-
args = ["-c", " chmod +x local/orchestrator && local/orchestrator --port ${port} --proxy-port ${proxy_port}"]
80+
args = ["-c", " chmod +x local/orchestrator && local/orchestrator --port ${port} --proxy-port ${proxy_port} --wait 0"]
5481
}
5582

5683
artifact {

packages/orchestrator/Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ run-debug:
4646
ORCHESTRATOR_SERVICES=$(ORCHESTRATOR_SERVICES) \
4747
GCP_DOCKER_REPOSITORY_NAME=$(GCP_DOCKER_REPOSITORY_NAME) \
4848
GOOGLE_SERVICE_ACCOUNT_BASE64=$(GOOGLE_SERVICE_ACCOUNT_BASE64) \
49-
./bin/orchestrator
49+
./bin/orchestrator --wait 0
5050

5151
.PHONY: upload/orchestrator
5252
upload/orchestrator:

packages/orchestrator/main.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ import (
1212
"os/signal"
1313
"slices"
1414
"syscall"
15+
"time"
1516

1617
"go.opentelemetry.io/otel"
1718
"go.uber.org/zap"
@@ -41,6 +42,7 @@ type Closeable interface {
4142
const (
4243
defaultPort = 5008
4344
defaultProxyPort = 5007
45+
defaultWait = 30
4446

4547
version = "0.1.0"
4648

@@ -53,6 +55,7 @@ var commitSHA string
5355
func main() {
5456
port := flag.Uint("port", defaultPort, "orchestrator server port")
5557
proxyPort := flag.Uint("proxy-port", defaultProxyPort, "orchestrator proxy port")
58+
wait := flag.Uint("wait", defaultWait, "orchestrator proxy port")
5659
flag.Parse()
5760

5861
if *port > math.MaxUint16 {
@@ -63,6 +66,12 @@ func main() {
6366
log.Fatalf("%d is larger than maximum possible proxy port %d", proxyPort, math.MaxInt16)
6467
}
6568

69+
// TODO: Remove after the orchestrator is fully migrated to the new job definition
70+
if *wait > 0 {
71+
log.Printf("waiting %d seconds before starting orchestrator", *wait)
72+
time.Sleep(time.Duration(*wait) * time.Second)
73+
}
74+
6675
success := run(*port, *proxyPort)
6776

6877
log.Println("Stopping orchestrator, success:", success)

0 commit comments

Comments
 (0)