wind-tunnel/nomad/scripts/wait_for_jobs.sh at 2a04d3b9e73c556b8fb4370424a12a8ea83ee59f · holochain/wind-tunnel · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
#!/bin/bash
#
# Waits for one or more Nomad allocations of a scenario to finish.
#
# Usage: wait_for_jobs.sh <scenario_name> <allocation_id> [more_alloc_ids...]
#
# Required environment: NOMAD_CACERT, NOMAD_ADDR, NOMAD_TOKEN.
# Optional environment: TIMEOUT (wait limit in seconds per allocation,
#                       default 1800).
#
# Exit codes: 0 unless every allocation fails; 1 on usage or status lookup
# errors, or when every allocation fails; 255 when TIMEOUT is exceeded.

set -euo pipefail

# Timeout in seconds (30 minutes). Can be overridden via environment.
TIMEOUT="${TIMEOUT:-1800}"

# TIMEOUT is later used in an arithmetic comparison; reject anything that is
# not a plain decimal integer up front instead of failing mid-run.
if [[ ! "$TIMEOUT" =~ ^[0-9]+$ ]]; then
  echo "TIMEOUT must be a non-negative integer number of seconds, got: $TIMEOUT" >&2
  exit 1
fi
# Force base 10 so that a value with leading zeros (e.g. "0900") is not
# interpreted as an (invalid) octal literal in arithmetic contexts.
TIMEOUT=$((10#$TIMEOUT))

# Aborts the script (exit 1) unless the variable whose NAME is given in $1
# is set to a non-empty value. Returns 0 otherwise.
function check_envset() {
  local var_name="$1"
  # Indirect expansion: look up the variable named by $var_name.
  [[ -n "${!var_name:-}" ]] && return 0
  printf '%s\n' "Environment variable $var_name is not set or is an empty string." >&2
  exit 1
}

# Aborts the script (exit 1) when the command named in $1 cannot be resolved
# by `command -v`. Returns 0 when it can.
function check_command() {
  local cmd="$1"
  if command -v "$cmd" >/dev/null 2>&1; then
    return 0
  fi
  printf '%s\n' "Command '$cmd' not found in PATH." >&2
  exit 1
}

# Prints the JSON status of the allocation whose ID is given in $1, as
# produced by `nomad alloc status -json`. The exit status is nomad's.
function get_nomad_status() {
  local allocation="$1"
  nomad alloc status -json "$allocation"
}

# Extracts the ClientStatus field from an allocation status JSON document.
# Arguments:
#   $1 - JSON text (e.g. the output of `nomad alloc status -json`)
#   $2 - optional allocation ID to name in error messages; when omitted, a
#        variable called `alloc_id` visible from the caller is used, and
#        "<unknown>" if there is none
# Outputs: the ClientStatus value on stdout; diagnostics on stderr.
# Exits:   1 when jq fails or the field is missing, empty or null.
function get_status() {
    local nomad_status="$1"
    # The defaults keep `set -u` from aborting inside the error path when no
    # allocation ID is available.
    local alloc_label="${2:-${alloc_id:-<unknown>}}"
    local client_status
    # Extract ClientStatus; fail fast if jq cannot parse the document.
    if ! client_status="$(jq -r '.ClientStatus' <<< "$nomad_status")"; then
        echo "Failed to retrieve status for allocation ID: $alloc_label" >&2
        exit 1
    fi
    # jq -r prints the literal string "null" for a missing field.
    if [[ -z "$client_status" || "$client_status" == "null" ]]; then
        echo "No job found with allocation ID: $alloc_label" >&2
        exit 1
    fi
    printf '%s\n' "$client_status"
}

# Returns 0 while an allocation has not reached a terminal state yet, i.e.
# its ClientStatus is "pending" (placed but not started) or "running";
# returns 1 for every other value.
# Allocation client statuses are described in the Nomad allocations API docs:
# https://developer.hashicorp.com/nomad/api-docs/allocations
function is_running() {
    local status="$1"
    case "$status" in
      pending|running)
        return 0 ;;  # still in progress
      *)
        return 1 ;;  # finished (or unknown state)
    esac
}

# Returns 0 only when the status in $1 is exactly "complete"; 1 otherwise.
function is_run_success() {
    local status="$1"
    case "$status" in
      complete)
        return 0 ;;
      *)
        return 1 ;;
    esac
}

# Prints diagnostics for an allocation: the stderr/stdout logs of every task
# that has recorded events, followed by the events themselves.
# Arguments:
#   $1 - allocation ID passed to `nomad alloc logs`
#   $2 - allocation status JSON (e.g. output of `nomad alloc status -json`)
# Outputs: log contents and one JSON object per task event
#          ({task, message, details}) on stdout.
# Note: events are not filtered by failure here; every event of every task
# listed under .TaskStates is reported.
function print_failed_tasks_and_logs() {
  local alloc_id="$1"
  local status="$2"
  local failed_tasks
  local task

  # Build one object per task event with the task name and the event text.
  # `// {}` avoids a jq error when .TaskStates is null or absent.
  failed_tasks=$(jq -r '
    (.TaskStates // {})
    | to_entries[]
    | . as $entry
    | $entry.value.Events[]?
    | {
      task: $entry.key,
      message: .DisplayMessage,
      details: .Details
    }
  ' <<< "$status")

  # Fetch logs once per unique task. Names are read line by line so a task
  # name containing whitespace or glob characters stays a single argument.
  # The list is fed through fd 3 so `nomad` cannot consume it from stdin.
  while IFS= read -r task <&3; do
    [[ -n "$task" ]] || continue
    echo "Fetching logs for task: $task"
    nomad alloc logs -stderr "$alloc_id" "$task" || echo "Failed to fetch stderr logs for task: $task"
    nomad alloc logs -stdout "$alloc_id" "$task" || echo "Failed to fetch stdout logs for task: $task"
  done 3< <(jq -r '.task' <<< "$failed_tasks" | sort -u)

  echo "Failed task: $failed_tasks"
}

# Polls a Nomad allocation once per second until it leaves the running state.
# Globals:
#   TIMEOUT (read) - maximum wait in seconds; 1800 is used when unset/empty
# Arguments:
#   $1 - scenario name (used in messages only)
#   $2 - allocation ID to poll
# Returns: 0 when the allocation completed successfully, 1 when it finished
#          with any other status (diagnostics are printed in that case).
# Exits:   1 when the status cannot be fetched or parsed, 255 on timeout.
function wait_for_job() {
    local scenario_name="$1"
    local alloc_id="$2"
    local timeout_secs="${TIMEOUT:-1800}"
    # Measure wall-clock time with bash's SECONDS counter so that the time
    # spent in each status query counts towards the timeout as well.
    local start_secs="$SECONDS"
    local elapsed_secs=0
    local nomad_status
    local status

    echo "Waiting for job: $scenario_name ($alloc_id)"

    # Run until timeout or the allocation is no longer running
    while true; do
        if ! nomad_status="$(get_nomad_status "$alloc_id")"; then
            echo "Failed to fetch Nomad status for $scenario_name ($alloc_id)" >&2
            exit 1
        fi
        # Checked explicitly: errexit is not in effect when this function is
        # invoked as an `if`/`!` condition, so a failed substitution would
        # otherwise continue with an empty status.
        if ! status="$(get_status "$nomad_status")"; then
            echo "Failed to determine status for $scenario_name ($alloc_id)" >&2
            exit 1
        fi
        elapsed_secs=$((SECONDS - start_secs))

        if is_running "$status"; then
            echo "Scenario $scenario_name ($alloc_id) is still running (status=$status) (elapsed: $elapsed_secs seconds)."
            sleep 1
            elapsed_secs=$((SECONDS - start_secs))
            if (( elapsed_secs > timeout_secs )); then
                echo "Timeout reached after $timeout_secs seconds." >&2
                exit 255
            fi
            continue
        fi

        # Job has completed at this point, either successfully or failed.
        if is_run_success "$status"; then
            echo "Scenario $scenario_name ($alloc_id) completed successfully in $elapsed_secs seconds."
            return 0
        fi

        echo "Scenario $scenario_name ($alloc_id) finished with status=$status after $elapsed_secs seconds."
        print_failed_tasks_and_logs "$alloc_id" "$nomad_status"
        return 1
    done
}


# verify NOMAD variables are set
check_envset "NOMAD_CACERT"
check_envset "NOMAD_ADDR"
check_envset "NOMAD_TOKEN"

# verify required tools are available
check_command "nomad"
check_command "jq"

# Scenario name must be passed as an argument; at least one allocation ID must be provided
if [[ $# -lt 2 ]]; then
  # Usage errors are diagnostics, so they go to stderr like every other error.
  echo "Usage: $0 <scenario_name> <allocation_id> [more_alloc_ids...]" >&2
  exit 1
fi

SCENARIO_NAME="$1"

shift # Remove the first argument (scenario name)

# Count total number of allocations and track failures
TOTAL_ALLOCATIONS=$#
FAILED_ALLOCATIONS=0

# Wait for each remaining argument (allocation ID) in turn
for alloc_id in "$@"; do
    if ! wait_for_job "$SCENARIO_NAME" "$alloc_id"; then
        FAILED_ALLOCATIONS=$((FAILED_ALLOCATIONS + 1))
    fi
done

# If all allocations failed, exit with error
if [[ $FAILED_ALLOCATIONS -eq $TOTAL_ALLOCATIONS ]]; then
    echo "Error: All $TOTAL_ALLOCATIONS allocation(s) failed." >&2
    exit 1
fi

# Partial failures are tolerated (exit status stays 0) but are reported so
# they do not go unnoticed.
if [[ $FAILED_ALLOCATIONS -gt 0 ]]; then
    echo "Warning: $FAILED_ALLOCATIONS of $TOTAL_ALLOCATIONS allocation(s) failed." >&2
fi

exit 0