Commit 04af087

feat(stackdriver_exporter): Add ErrorLogger for promhttp
feat(stackdriver_exporter): Add ErrorLogger for promhttp
I recently ran into #103 and #166 in production, and it took quite some time to recognize there was a problem with `stackdriver_exporter`, because nothing was logged to indicate problems gathering metrics. From my perspective the pod was healthy and online, and I could curl `/metrics` to get results. Grafana Agent, however, was getting errors when scraping, specifically errors like:

```
[from Gatherer #2] collected metric "stackdriver_gce_instance_compute_googleapis_com_instance_disk_write_bytes_count" { label:{name:"device_name" value:"REDACTED_FOR_SECURITY"} label:{name:"device_type" value:"permanent"} label:{name:"instance_id" value:"2924941021702260446"} label:{name:"instance_name" value:"REDACTED_FOR_SECURITY"} label:{name:"project_id" value:"REDACTED_FOR_SECURITY"} label:{name:"storage_type" value:"pd-ssd"} label:{name:"unit" value:"By"} label:{name:"zone" value:"us-central1-a"} counter:{value:0} timestamp_ms:1698871080000} was collected before with the same name and label values
```

To help identify the root cause, this commit adds the ability to opt into logging errors that come from the handler. Specifically, it introduces the struct `customPromErrorLogger`, which implements the `promhttp.Logger` interface, and a new flag, `monitoring.enable-promhttp-custom-logger`. When the flag is set to true, an instance of `customPromErrorLogger` is created and used as the `ErrorLog` value in `promhttp.HandlerOpts{}`. Otherwise, `stackdriver_exporter` works as it did before and does not log errors encountered while collecting metrics.

- refs #103, #166
1 parent 8b01e7d commit 04af087

2 files changed: +50 −24 lines

README.md (+24 −23; the diff re-aligns the whole table, so it is shown condensed here with the one added row marked)

```diff
@@ -76,29 +76,30 @@ If you are still using the legacy [Access scopes][access-scopes], the `https://w
 
 ### Flags
 
 | Flag | Required | Default | Description |
 |------|----------|---------|-------------|
 | `google.project-id` | No | GCloud SDK auto-discovery | Comma seperated list of Google Project IDs |
 | `google.projects.filter` | No | | GCloud projects filter expression. See more [here](https://cloud.google.com/sdk/gcloud/reference/projects/list). |
 | `monitoring.metrics-ingest-delay` | No | | Offsets metric collection by a delay appropriate for each metric type, e.g. because bigquery metrics are slow to appear |
 | `monitoring.drop-delegated-projects` | No | No | Drop metrics from attached projects and fetch `project_id` only. |
 | `monitoring.metrics-type-prefixes` | Yes | | Comma separated Google Stackdriver Monitoring Metric Type prefixes (see [example][metrics-prefix-example] and [available metrics][metrics-list]) |
 | `monitoring.metrics-interval` | No | `5m` | Metric's timestamp interval to request from the Google Stackdriver Monitoring Metrics API. Only the most recent data point is used |
 | `monitoring.metrics-offset` | No | `0s` | Offset (into the past) for the metric's timestamp interval to request from the Google Stackdriver Monitoring Metrics API, to handle latency in published metrics |
 | `monitoring.filters` | No | | Formatted string to allow filtering on certain metrics type |
 | `monitoring.aggregate-deltas` | No | | If enabled will treat all DELTA metrics as an in-memory counter instead of a gauge. Be sure to read [what to know about aggregating DELTA metrics](#what-to-know-about-aggregating-delta-metrics) |
 | `monitoring.aggregate-deltas-ttl` | No | `30m` | How long should a delta metric continue to be exported and stored after GCP stops producing it. Read [slow moving metrics](#slow-moving-metrics) to understand the problem this attempts to solve |
 | `monitoring.descriptor-cache-ttl` | No | `0s` | How long should the metric descriptors for a prefixed be cached for |
+| `monitoring.enable-promhttp-custom-logger` | No | False | If enabled will create a custom error logging handler for promhttp |
 | `stackdriver.max-retries` | No | `0` | Max number of retries that should be attempted on 503 errors from stackdriver. |
 | `stackdriver.http-timeout` | No | `10s` | How long should stackdriver_exporter wait for a result from the Stackdriver API. |
 | `stackdriver.max-backoff=` | No | | Max time between each request in an exp backoff scenario. |
 | `stackdriver.backoff-jitter` | No | `1s` | The amount of jitter to introduce in a exp backoff scenario. |
 | `stackdriver.retry-statuses` | No | `503` | The HTTP statuses that should trigger a retry. |
 | `web.config.file` | No | | [EXPERIMENTAL] Path to configuration file that can enable TLS or authentication. |
 | `web.listen-address` | No | `:9255` | Address to listen on for web interface and telemetry Repeatable for multiple addresses. |
 | `web.systemd-socket` | No | | Use systemd socket activation listeners instead of port listeners (Linux only). |
 | `web.stackdriver-telemetry-path` | No | `/metrics` | Path under which to expose Stackdriver metrics. |
 | `web.telemetry-path` | No | `/metrics` | Path under which to expose Prometheus metrics |
 
 ### TLS and basic authentication
```
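A hypothetical invocation combining the new flag with existing ones might look like the following (the project ID and metric prefix are placeholders, not values from this commit):

```shell
stackdriver_exporter \
  --google.project-id=my-project \
  --monitoring.metrics-type-prefixes='compute.googleapis.com/instance' \
  --monitoring.enable-promhttp-custom-logger
```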
stackdriver_exporter.go (+26 −1)

```diff
@@ -125,6 +125,10 @@ var (
 	monitoringDescriptorCacheOnlyGoogle = kingpin.Flag(
 		"monitoring.descriptor-cache-only-google", "Only cache descriptors for *.googleapis.com metrics",
 	).Default("true").Bool()
+
+	monitoringEnablePromHttpCustomLogger = kingpin.Flag(
+		"monitoring.enable-promhttp-custom-logger", "Enable custom logger for promhttp",
+	).Default("false").Bool()
 )
 
 func init() {
@@ -236,7 +240,14 @@ func (h *handler) innerHandler(filters map[string]bool) http.Handler {
 	}
 
 	// Delegate http serving to Prometheus client library, which will call collector.Collect.
-	return promhttp.HandlerFor(gatherers, promhttp.HandlerOpts{})
+	opts := promhttp.HandlerOpts{}
+	if *monitoringEnablePromHttpCustomLogger {
+		h.logger.Log("msg", "Enabling custom logger for promhttp")
+		opts = promhttp.HandlerOpts{
+			ErrorLog: NewPromHttpCustomLogger(h.logger),
+		}
+	}
+	return promhttp.HandlerFor(gatherers, opts)
 }
 
 // filterMetricTypePrefixes filters the initial list of metric type prefixes, with the ones coming from an individual
@@ -365,3 +376,17 @@ func parseMetricExtraFilters() []collectors.MetricFilter {
 	}
 	return extraFilters
 }
+
+type customPromErrorLogger struct {
+	logger log.Logger
+}
+
+func (l *customPromErrorLogger) Println(v ...interface{}) {
+	level.Error(l.logger).Log("msg", fmt.Sprint(v...))
+}
+
+func NewPromHttpCustomLogger(logger log.Logger) *customPromErrorLogger {
+	return &customPromErrorLogger{
+		logger: logger,
+	}
+}
```
