package main

import (
	"database/sql"
	"fmt"
	"log"
	"net/http"
	"os"
	"strconv"
	"time"

	_ "github.com/lib/pq" // PostgreSQL driver
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)

const (
	tableName = "data"
	// dbURLEnvVar is the environment variable name for the database connection string.
	dbURLEnvVar = "CONNECTION_STRING"
	// requiredRecentReadings is the number of recent KPI error readings considered for the kanary_up signal.
	requiredRecentReadings = 3
	// delayInSeconds is the amount of time in seconds allowed without a new entry in the database.
	delayInSeconds = 10800
)

var (
	kanaryUpMetric = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "kanary_up",
			// UP if at least one of the recent error readings is <= 0, DOWN if all are > 0.
			Help: fmt.Sprintf("Kanary signal: 1 if at least one of last %d error readings is 0 or less, 0 if all last %d error readings are greater than 0. Only updated when %d valid data points are available.", requiredRecentReadings, requiredRecentReadings, requiredRecentReadings),
		},
		[]string{"instance"},
	)

	kanaryErrorMetric = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "kanary_error",
			Help: "Binary indicator of an error in processing for the Kanary signal (1 if interrupted, 0 otherwise). An error prevents kanary_up from being updated. The 'reason' label provides details on the error type.",
		},
		[]string{"instance", "reason"},
	)

	// targetClusters is the list of member cluster names to monitor.
	targetClusters = []string{
		// Private clusters
		"stone-stage-p01",
		"stone-prod-p01",
		"stone-prod-p02",
		// Public clusters
		"stone-stg-rh01",
		"stone-prd-rh01",
	}

	// --- Database queries
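	// Note: label_values is assumed to be a PostgreSQL jsonb column here;
	// ->> extracts a field as text and ? tests whether a key exists.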

	// datapointsCountQuery counts the entries available in the database for a given cluster.
	datapointsCountQuery = fmt.Sprintf(`
	SELECT COUNT(*)
	FROM %s
	WHERE
		label_values->>'.metadata.env.MEMBER_CLUSTER' LIKE $1
		AND ( label_values->>'.repo_type' = 'nodejs-devfile-sample' OR NOT (label_values ? '.repo_type') );
	`, tableName)

	// delayCheckQuery checks that the latest entry (by start time) for a given cluster ended within the allowed delay.
	delayCheckQuery = fmt.Sprintf(`
	WITH LatestRowByStart AS (
		SELECT
			-- REPLACE normalizes the timestamp to the standard ISO 8601 format (comma to dot) before casting.
			EXTRACT(epoch FROM (REPLACE(label_values->>'.ended', ',', '.'))::timestamptz) AS ended_epoch,
			(EXTRACT(epoch FROM CURRENT_TIMESTAMP) - %d) AS earliest_allowed_ended_epoch
		FROM
			%s
		WHERE
			label_values->>'.metadata.env.MEMBER_CLUSTER' LIKE $1
			AND (label_values->>'.repo_type' = 'nodejs-devfile-sample' OR NOT (label_values ? '.repo_type'))
		ORDER BY
			EXTRACT(epoch FROM start) DESC
		LIMIT 1
	)
	SELECT COUNT(*)
	FROM LatestRowByStart
	WHERE ended_epoch >= earliest_allowed_ended_epoch
	`, delayInSeconds, tableName)

	// dataQuery fetches the most recent KPI error values, filtering by member cluster using LIKE and optionally by repo_type.
	dataQuery = fmt.Sprintf(`
	SELECT
		label_values->>'__results_measurements_KPI_errors' AS kpi_error_value
	FROM
		%s
	WHERE
		label_values->>'.metadata.env.MEMBER_CLUSTER' LIKE $1
		AND ( label_values->>'.repo_type' = 'nodejs-devfile-sample' OR NOT (label_values ? '.repo_type') )
	ORDER BY
		EXTRACT(epoch FROM start) DESC
	LIMIT %d;
	`, tableName, requiredRecentReadings)
)

// getKpiErrorReadings fetches and validates the last 'requiredRecentReadings' KPI error counts for a given cluster.
// It returns a slice of int64 values on success, an internal status string describing the error reason, and an error if any issue occurs.
func getKpiErrorReadings(db *sql.DB, clusterName string) ([]int64, string, error) {
	clusterSubStringPattern := "%" + clusterName + "%"
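	// The surrounding % wildcards make the LIKE filter match any MEMBER_CLUSTER value
	// that contains the cluster name as a substring.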

	// Check that the database has datapoints for this cluster.
	var datapointsCount int
	err := db.QueryRow(datapointsCountQuery, clusterSubStringPattern).Scan(&datapointsCount)
	if err != nil {
		if err == sql.ErrNoRows {
			datapointsCount = 0
		} else {
			return nil, "db_error", fmt.Errorf("database count query failed for cluster %s: %w", clusterName, err)
		}
	}

	if datapointsCount == 0 {
		return nil, "db_error", fmt.Errorf("no datapoints found in the database for cluster %s", clusterName)
	}

	// Check that the latest datapoint for the cluster is not delayed.
	var delayConditionMetCount int
	err = db.QueryRow(delayCheckQuery, clusterSubStringPattern).Scan(&delayConditionMetCount)
	if err != nil {
		return nil, "db_error", fmt.Errorf("delay condition check query failed for cluster %s: %w", clusterName, err)
	}

	if delayConditionMetCount == 0 {
		return nil, "no_test_results", fmt.Errorf("last datapoint for cluster %s is older than %d hours", clusterName, (delayInSeconds/60)/60)
	}

	// Fetch the KPI error readings used to assess the health of the given cluster.
	rows, err := db.Query(dataQuery, clusterSubStringPattern)
	if err != nil {
		return nil, "db_error", fmt.Errorf("database query failed for cluster %s: %w", clusterName, err)
	}
	defer rows.Close()

	var parsedErrorReadings []int64
	var rawValuesForLog []string

	for rows.Next() {
		var kpiErrorValueStr sql.NullString
		if err := rows.Scan(&kpiErrorValueStr); err != nil {
			// An error during row scan is considered a database error.
			return nil, "db_error", fmt.Errorf("failed to scan row for cluster %s: %w", clusterName, err)
		}

		if !kpiErrorValueStr.Valid || kpiErrorValueStr.String == "" {
			// NULL or empty values are data processing issues.
			return nil, "db_error", fmt.Errorf("found NULL or empty kpi_error_value in one of the last %d rows for cluster %s. Raw values so far: %v", requiredRecentReadings, clusterName, rawValuesForLog)
		}
		rawValuesForLog = append(rawValuesForLog, kpiErrorValueStr.String)

		kpiErrorCount, err := strconv.ParseInt(kpiErrorValueStr.String, 10, 64)
		if err != nil {
			// Failure to parse the error count is a data processing issue.
			return nil, "db_error", fmt.Errorf("failed to parse kpi_error_value '%s' as integer for cluster %s: %w", kpiErrorValueStr.String, clusterName, err)
		}
		parsedErrorReadings = append(parsedErrorReadings, kpiErrorCount)
	}

	if err := rows.Err(); err != nil {
		// Errors encountered during iteration (e.g., network issues while streaming results) are database errors.
		return nil, "db_error", fmt.Errorf("error during row iteration for cluster %s: %w", clusterName, err)
	}

	if len(parsedErrorReadings) < requiredRecentReadings {
		// Not enough data points is considered a database/query issue.
		return nil, "db_error", fmt.Errorf("expected %d data points for cluster %s, but query returned %d. Raw values: %v", requiredRecentReadings, clusterName, len(parsedErrorReadings), rawValuesForLog)
	}

	return parsedErrorReadings, "data_ok", nil
}

// fetchAndExportMetrics orchestrates fetching data and updating Prometheus metrics for all target clusters.
func fetchAndExportMetrics(db *sql.DB) {
	for _, clusterName := range targetClusters {
		reasonForError := ""
		kpiErrorReadings, internalStatusMsg, err := getKpiErrorReadings(db, clusterName)

		if err != nil {
			// An error occurred (DB error, parse error, insufficient data, etc.).
			reasonForError = internalStatusMsg
			log.Printf("Error for cluster '%s' (%s): %v. kanary_up will be kept at 1 because kanary_error is set.", clusterName, reasonForError, err)
			kanaryErrorMetric.WithLabelValues(clusterName, reasonForError).Set(1)
			// Keep kanary_up at 1 in case of kanary_error.
			kanaryUpMetric.WithLabelValues(clusterName).Set(1)
		} else {
			// Successfully retrieved and parsed data; clear any previous error indicators.
			kanaryErrorMetric.WithLabelValues(clusterName, "db_error").Set(0)
			kanaryErrorMetric.WithLabelValues(clusterName, "no_test_results").Set(0)

			// Determine signal status: UP if at least one error reading is <= 0, DOWN if all are > 0.
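			// For example, readings of [2, 0, 1] leave the signal UP (one reading is 0),
			// while [3, 1, 2] mark it DOWN (all readings are greater than 0).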
			down := true
			if len(kpiErrorReadings) == 0 {
				// This case should already be caught by getKpiErrorReadings returning an error.
				// If it somehow occurs, leave the signal UP rather than treating all readings as strictly positive.
				down = false
			} else {
				for _, errorCount := range kpiErrorReadings {
					if errorCount <= 0 {
						down = false
						break
					}
				}
			}

			if down {
				// All recent error readings are > 0, so the signal is DOWN.
				kanaryUpMetric.WithLabelValues(clusterName).Set(0)
				log.Printf("KO: Kanary signal for cluster '%s' is DOWN (all last %d error readings > 0): %v", clusterName, requiredRecentReadings, kpiErrorReadings)
			} else {
				// At least one recent error reading is <= 0, so the signal is UP.
				kanaryUpMetric.WithLabelValues(clusterName).Set(1)
				log.Printf("OK: Kanary signal for cluster '%s' is UP: %v.", clusterName, kpiErrorReadings)
			}
		}
	}
}

func main() {
	databaseURL := os.Getenv(dbURLEnvVar)
	if databaseURL == "" {
		log.Fatalf("FATAL: Environment variable %s is not set or is empty. Example: export %s=\"postgres://user:pass@host:port/db?sslmode=disable\"", dbURLEnvVar, dbURLEnvVar)
	}

	db, err := sql.Open("postgres", databaseURL)
	if err != nil {
		log.Fatalf("FATAL: Error opening database handle using DSN from %s: %v", dbURLEnvVar, err)
	}
	defer db.Close()

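	// Note: sql.Open does not necessarily establish a connection;
	// the Ping below verifies that the database is actually reachable.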
	if err = db.Ping(); err != nil {
		log.Fatalf("FATAL: Error pinging database: %v", err)
	}
	log.Println("Successfully connected to the database.")

	// Create a new PedanticRegistry.
	reg := prometheus.NewPedanticRegistry()

	// Register metrics with the new PedanticRegistry.
	reg.MustRegister(kanaryUpMetric)
	reg.MustRegister(kanaryErrorMetric)
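	// Because a dedicated registry is used instead of the default one, /metrics exposes
	// only these kanary metrics (no default Go runtime or process collectors).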

	// Expose the registered metrics via HTTP.
	// Use promhttp.HandlerFor to serve the PedanticRegistry.
	http.Handle("/metrics", promhttp.HandlerFor(reg, promhttp.HandlerOpts{}))

	go func() {
		log.Println("Prometheus exporter starting on :8000/metrics ...")
		if err := http.ListenAndServe(":8000", nil); err != nil {
			log.Fatalf("FATAL: Error starting Prometheus HTTP server: %v", err)
		}
	}()

	log.Println("Performing initial metrics fetch...")
	fetchAndExportMetrics(db)
	log.Println("Initial metrics fetch complete.")

	// Periodically fetch metrics. The interval could be made configurable.
	scrapeInterval := 300 * time.Second
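	// One possible way to make the interval configurable (hypothetical SCRAPE_INTERVAL_SECONDS variable):
	//
	//	if s := os.Getenv("SCRAPE_INTERVAL_SECONDS"); s != "" {
	//		if secs, err := strconv.Atoi(s); err == nil {
	//			scrapeInterval = time.Duration(secs) * time.Second
	//		}
	//	}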
	log.Printf("Starting periodic metrics fetch every %v.", scrapeInterval)
	ticker := time.NewTicker(scrapeInterval)
	defer ticker.Stop()

	for range ticker.C {
		log.Println("Fetching and exporting metrics...")
		fetchAndExportMetrics(db)
		log.Println("Metrics fetch complete.")
	}
}