Skip to content

Commit e9d5fb1

Browse files
authored
Merge pull request #22 from gitpod-io/nv/managed-metrics-push
feat: support managed endpoint direct push in config-reloader
2 parents f436aea + 9cf3257 commit e9d5fb1

1 file changed

Lines changed: 136 additions & 21 deletions

File tree

files/runner-cloud-init.tftpl

Lines changed: 136 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -423,12 +423,14 @@ write_files:
423423
REMOTE_USER=$(echo "$SECRET_DATA" | jq -r '.user // ""')
424424
REMOTE_PASSWORD=$(echo "$SECRET_DATA" | jq -r '.password // ""')
425425
LOCAL_REMOTE_WRITE_URL=$(echo "$SECRET_DATA" | jq -r '.localRemoteWriteUrl // ""')
426+
MANAGED_ENDPOINT_URL=$(echo "$SECRET_DATA" | jq -r '.managedEndpointUrl // ""')
427+
MANAGED_BEARER_TOKEN=$(echo "$SECRET_DATA" | jq -r '.managedBearerToken // ""')
426428
RUNNER_ID=$(echo "$SECRET_DATA" | jq -r '.runnerId // ""')
427429
ORGANIZATION_ID=$(echo "$SECRET_DATA" | jq -r '.organizationId // ""')
428430
# Read allowlist as a bash array of prefixes
429431
mapfile -t ALLOWLIST_PREFIXES < <(echo "$SECRET_DATA" | jq -r '.allowlistPrefixes // [] | .[]')
430432

431-
log "Metrics enabled: $ENABLE_METRICS, URL: $REMOTE_URL, local remote write: $LOCAL_REMOTE_WRITE_URL, runner_id: $RUNNER_ID, organization_id: $ORGANIZATION_ID, allowlist prefixes: $${#ALLOWLIST_PREFIXES[@]}"
433+
log "Metrics enabled: $ENABLE_METRICS, URL: $REMOTE_URL, managed endpoint: $MANAGED_ENDPOINT_URL, local remote write: $LOCAL_REMOTE_WRITE_URL, runner_id: $RUNNER_ID, organization_id: $ORGANIZATION_ID, allowlist prefixes: $${#ALLOWLIST_PREFIXES[@]}"
432434

433435
# Generate final configuration using template substitution
434436
sed -e "s/{{INSTANCE_NAME}}/$INSTANCE_NAME/g" \
@@ -464,33 +466,57 @@ write_files:
464466
fi
465467
fi
466468

467-
# Add local remote write target for managed metrics pipeline
468-
if [ -n "$LOCAL_REMOTE_WRITE_URL" ]; then
469+
# Build allowlist regex once — reused by both managed and local targets.
# Result shape: (prefix1.*|prefix2.*|...) with one alternation arm per entry
# of ALLOWLIST_PREFIXES. ALLOWLIST_REGEX stays empty when the array is empty,
# which append_allowlist_relabel treats as "emit no filter".
# The doubled dollar signs below are presumably the template-level escape for
# a literal shell expansion (this file is a .tftpl) — keep them doubled.
# NOTE(review): prefixes are inserted verbatim, so any regex metacharacter in
# a prefix is interpreted as regex syntax — confirm that is intended.
470+
ALLOWLIST_REGEX=""
471+
if [ $${#ALLOWLIST_PREFIXES[@]} -gt 0 ]; then
472+
ALLOWLIST_REGEX="("
473+
# Iterate over indices (not values) so the separator can be skipped for
# the first element.
for i in "$${!ALLOWLIST_PREFIXES[@]}"; do
474+
if [ "$i" -gt 0 ]; then ALLOWLIST_REGEX+="|"; fi
475+
ALLOWLIST_REGEX+="$${ALLOWLIST_PREFIXES[$i]}.*"
476+
done
477+
ALLOWLIST_REGEX+=")"
478+
fi
479+
480+
#######################################
# Append a write_relabel_configs stanza that keeps only allowlisted metrics.
# Does nothing when ALLOWLIST_REGEX is empty.
# Globals:   ALLOWLIST_REGEX (read) - alternation regex matched against
#            the __name__ label
# Arguments: $1 - optional file to append to
#                 (default: /tmp/prometheus.yml.new)
# Outputs:   appends four lines to the target file
# Returns:   0 when nothing had to be written, otherwise the status of
#            the append
#######################################
append_allowlist_relabel() {
  local target="/tmp/prometheus.yml.new"
  local quoted_regex

  if [ "$#" -gt 0 ] && [ -n "$1" ]; then
    target="$1"
  fi

  if [ -z "$ALLOWLIST_REGEX" ]; then
    return 0
  fi

  # The regex is emitted as a YAML single-quoted scalar. Inside such a
  # scalar a literal quote must be doubled, otherwise a stray ' would end
  # the scalar early and corrupt the generated configuration.
  quoted_regex=$(printf '%s' "$ALLOWLIST_REGEX" | sed "s/'/''/g")

  # NOTE(review): the leading whitespace of these literals determines the
  # YAML nesting under the preceding "- url:" entry — confirm it matches.
  {
    printf '%s\n' " write_relabel_configs:"
    printf '%s\n' " - source_labels: [__name__]"
    printf '%s\n' " regex: '$quoted_regex'"
    printf '%s\n' " action: keep"
  } >> "$target"
}
489+
490+
# Select at most one managed-metrics remote_write target:
#   1. the managed endpoint, when both its URL and bearer token are set
#      (this branch also adds the local audit receiver as a second target);
#   2. otherwise LOCAL_REMOTE_WRITE_URL, when set;
#   3. otherwise nothing is appended.
# Add managed endpoint direct push (preferred over local receiver).
491+
# Uses a scoped JWT to push metrics directly to the management plane.
492+
if [ -n "$MANAGED_ENDPOINT_URL" ] && [ -n "$MANAGED_BEARER_TOKEN" ]; then
493+
# Only the URL is logged; the token is never passed to log.
log "Adding managed endpoint remote write target: $MANAGED_ENDPOINT_URL"
494+
# Emit the top-level remote_write: key only if no earlier target did.
if [ "$HAS_REMOTE_WRITE" = "false" ]; then
495+
echo "" >> /tmp/prometheus.yml.new
496+
echo "remote_write:" >> /tmp/prometheus.yml.new
497+
HAS_REMOTE_WRITE=true
498+
fi
499+
echo " - url: $MANAGED_ENDPOINT_URL" >> /tmp/prometheus.yml.new
500+
echo " authorization:" >> /tmp/prometheus.yml.new
501+
echo " type: Bearer" >> /tmp/prometheus.yml.new
502+
# NOTE(review): the bearer token is written in clear text to the fixed
# path /tmp/prometheus.yml.new — confirm that file's ownership and mode
# are restricted before and after it is moved into place.
echo " credentials: $MANAGED_BEARER_TOKEN" >> /tmp/prometheus.yml.new
503+
append_allowlist_relabel
504+
505+
# Audit: send the same filtered payload to the local audit receiver
506+
# which persists each write to GCS for customer audit trails.
507+
log "Adding metrics audit receiver remote write target"
508+
# Host and port are hard-coded here and must stay in step with the
# address the audit receiver listens on (127.0.0.1:9095).
echo " - url: http://127.0.0.1:9095/write" >> /tmp/prometheus.yml.new
509+
# Same allowlist as the managed target, so both receive the same series.
append_allowlist_relabel
510+
elif [ -n "$LOCAL_REMOTE_WRITE_URL" ]; then
511+
# Fallback: local remote write target for managed metrics pipeline.
469512
log "Adding local remote write target: $LOCAL_REMOTE_WRITE_URL"
470513
if [ "$HAS_REMOTE_WRITE" = "false" ]; then
471514
echo "" >> /tmp/prometheus.yml.new
472515
echo "remote_write:" >> /tmp/prometheus.yml.new
473516
HAS_REMOTE_WRITE=true
474517
fi
475518
echo " - url: $LOCAL_REMOTE_WRITE_URL" >> /tmp/prometheus.yml.new
476-
477-
# Add write_relabel_configs to filter by allowlist prefixes.
478-
# Only metrics matching these prefixes are forwarded to the
479-
# managed metrics pipeline. Uses a single regex with alternation.
480-
if [ $${#ALLOWLIST_PREFIXES[@]} -gt 0 ]; then
481-
# Build regex: (prefix1.*|prefix2.*|...)
482-
REGEX="("
483-
for i in "$${!ALLOWLIST_PREFIXES[@]}"; do
484-
if [ "$i" -gt 0 ]; then REGEX+="|"; fi
485-
REGEX+="$${ALLOWLIST_PREFIXES[$i]}.*"
486-
done
487-
REGEX+=")"
488-
489-
echo " write_relabel_configs:" >> /tmp/prometheus.yml.new
490-
echo " - source_labels: [__name__]" >> /tmp/prometheus.yml.new
491-
echo " regex: '$REGEX'" >> /tmp/prometheus.yml.new
492-
echo " action: keep" >> /tmp/prometheus.yml.new
493-
fi
519+
append_allowlist_relabel
494520
fi
495521

496522
# Check if configuration changed
@@ -576,6 +602,8 @@ write_files:
576602
INSTANCE_GROUP_NAME=${INSTANCE_GROUP_NAME}
577603
BUILD_CACHE_BUCKET=${BUILD_CACHE_BUCKET}
578604
GITPOD_DEVELOPMENT_VERSION=${DEVELOPMENT_VERSION}
605+
MANAGED_METRICS_DIRECT_PUSH=true
606+
RUNNER_ASSETS_BUCKET_NAME=${RUNNER_ASSETS_BUCKET_NAME}
579607
PUBSUB_SUBSCRIPTION_ID=${PUBSUB_SUBSCRIPTION_ID}
580608
AUTH_PROXY_URL=${AUTH_PROXY_URL}
581609
RUNNER_LOGS_URL="${RUNNER_LOGS_URL}"
@@ -682,6 +710,7 @@ write_files:
682710
--env https_proxy=${HTTPS_PROXY} \
683711
--env all_proxy=${ALL_PROXY} \
684712
--env GITPOD_DEVELOPMENT_VERSION=${DEVELOPMENT_VERSION} \
713+
--env MANAGED_METRICS_DIRECT_PUSH=true \
685714
--env GITPOD_TERRAFORM_MODULE_VERSION=${TERRAFORM_MODULE_VERSION} \
686715
--env no_proxy=${NO_PROXY} \
687716
%{ if HAS_TRUST_BUNDLE ~}
@@ -861,6 +890,91 @@ write_files:
861890
# Execute main function
862891
main "$@"
863892

893+
# Metrics audit receiver — accepts Prometheus remote_write POSTs and
894+
# writes each payload to GCS so customers can audit exactly what data
895+
# leaves their network. Listens on 127.0.0.1:9095.
896+
- path: /var/lib/gitpod/metrics-audit-receiver.py
897+
permissions: '0755'
898+
content: |
899+
#!/usr/bin/env python3
900+
"""Receives Prometheus remote_write payloads and writes them to GCS."""
901+
import os
902+
import subprocess
903+
import sys
904+
import time
905+
from http.server import HTTPServer, BaseHTTPRequestHandler
906+
907+
LISTEN_ADDR = "127.0.0.1"
908+
LISTEN_PORT = 9095
909+
BUCKET = os.environ.get("RUNNER_ASSETS_BUCKET_NAME", "")
910+
RUNNER_ID = os.environ.get("RUNNER_ID", "")
911+
912+
class AuditHandler(BaseHTTPRequestHandler):
    """Store the body of every POST request as one object in GCS.

    The body is treated as opaque bytes and piped to ``gcloud storage cp``.

    Responses:
        204 -- payload stored, or the request carried no body.
        400 -- Content-Length is not a non-negative integer.
        502 -- the upload failed, timed out, or could not be started.
    """

    @staticmethod
    def build_object_key(runner_id, epoch_ns):
        """Return the object key for a payload received at ``epoch_ns``.

        Args:
            runner_id: identifier placed in the key's path.
            epoch_ns: receive time as integer nanoseconds since the Unix
                epoch (for example the value of ``time.time_ns()``).

        Returns:
            ``metrics/runner/<id>/<YYYY>/<MM>/<DD>/<HHMMSS>-<nanos>.pb.snappy``
            with all date and time fields in UTC.

        The nine-digit sub-second suffix keeps payloads that arrive within
        the same second from overwriting each other.
        """
        seconds, nanos = divmod(epoch_ns, 1000000000)
        now = time.gmtime(seconds)
        return (
            "metrics/runner/{rid}/{y}/{m:02d}/{d:02d}/"
            "{H:02d}{M:02d}{S:02d}-{ns:09d}.pb.snappy"
        ).format(
            rid=runner_id,
            y=now.tm_year, m=now.tm_mon, d=now.tm_mday,
            H=now.tm_hour, M=now.tm_min, S=now.tm_sec,
            ns=nanos,
        )

    def _reply(self, status):
        """Send a body-less response with the given status code."""
        self.send_response(status)
        self.end_headers()

    def do_POST(self):
        """Upload the request body to the bucket and report the outcome."""
        try:
            length = int(self.headers.get("Content-Length", 0))
        except ValueError:
            length = -1
        if length < 0:
            # A negative length would make rfile.read() wait for EOF.
            self._reply(400)
            return
        if length == 0:
            self._reply(204)
            return

        body = self.rfile.read(length)
        key = self.build_object_key(RUNNER_ID, time.time_ns())
        dst = "gs://{}/{}".format(BUCKET, key)

        try:
            proc = subprocess.run(
                ["gcloud", "storage", "cp", "-", dst],
                input=body, capture_output=True, timeout=30,
            )
        except Exception as e:
            # Covers a missing gcloud binary and the 30 second timeout.
            sys.stderr.write("audit write error: {}\n".format(e))
            self._reply(502)
            return

        if proc.returncode != 0:
            # errors="replace" keeps undecodable output from raising here.
            sys.stderr.write(
                "gcloud cp failed: {}\n".format(
                    proc.stderr.decode(errors="replace")
                )
            )
            self._reply(502)
            return

        self._reply(204)

    def log_message(self, fmt, *args):
        pass  # suppress per-request access logs
950+
951+
# Refuse to start without a destination bucket and a runner id: both are
# needed to build the gs:// destination of every upload.
if not BUCKET or not RUNNER_ID:
952+
sys.stderr.write("RUNNER_ASSETS_BUCKET_NAME or RUNNER_ID not set, exiting\n")
953+
sys.exit(1)
954+
955+
# HTTPServer (unlike ThreadingHTTPServer) handles one request at a time,
# so each POST waits for the previous upload to finish.
server = HTTPServer((LISTEN_ADDR, LISTEN_PORT), AuditHandler)
956+
# Startup banner goes to stderr; per-request logging is suppressed by the
# handler's log_message override.
sys.stderr.write("metrics-audit-receiver listening on {}:{}\n".format(LISTEN_ADDR, LISTEN_PORT))
957+
# Blocks until the process is stopped.
server.serve_forever()
958+
959+
# Systemd service for the metrics audit receiver
960+
- path: /var/lib/systemd/system/metrics-audit-receiver.service
961+
permissions: '0644'
962+
content: |
963+
# Runs /var/lib/gitpod/metrics-audit-receiver.py as a long-lived service.
[Unit]
964+
Description=Metrics Audit Receiver
965+
# Ordering only: start after network.target and before prometheus.service.
After=network.target
966+
Before=prometheus.service
967+
968+
[Service]
969+
Type=simple
970+
# Restart on every exit, waiting 5 seconds between attempts.
Restart=always
971+
RestartSec=5s
972+
# Environment for the receiver is loaded from this file; the script exits
# with status 1 when RUNNER_ASSETS_BUCKET_NAME or RUNNER_ID is missing.
EnvironmentFile=/var/lib/gitpod/runner.env
973+
ExecStart=/var/lib/gitpod/metrics-audit-receiver.py
974+
975+
[Install]
976+
WantedBy=multi-user.target
977+
864978
# Enhanced startup script with better error handling and validation
865979
- path: /tmp/container-startup.sh
866980
permissions: '0755'
@@ -999,6 +1113,7 @@ write_files:
9991113
"node-exporter.service"
10001114
"prometheus-config-updater.service"
10011115
"prometheus-config-updater.timer"
1116+
"metrics-audit-receiver.service"
10021117
"gitpod-auth-proxy.service"
10031118
"gitpod-runner.service"
10041119
)

0 commit comments

Comments
 (0)