diff --git a/.github/workflows/e2e-test-gcp-runner.yml b/.github/workflows/e2e-test-gcp-runner.yml deleted file mode 100644 index 1029495..0000000 --- a/.github/workflows/e2e-test-gcp-runner.yml +++ /dev/null @@ -1,89 +0,0 @@ -name: GCP Runner E2E Tests - -on: - schedule: - # Run daily at 6 AM UTC - - cron: "0 6 * * *" - workflow_dispatch: - -permissions: - contents: read - -defaults: - run: - shell: bash - -env: - # GCP Configuration - Override with variables if provided - GCP_PROJECT_ID: ${{ vars.E2E_GCP_PROJECT_ID || 'gitpod-gcp-runner-e2e-tests' }} - GCP_REGION: ${{ vars.E2E_GCP_REGION || 'us-central1' }} - GITPOD_API_ENDPOINT: ${{ vars.E2E_GITPOD_API_ENDPOINT || 'https://app.gitpod.io/api' }} - -jobs: - e2e-test: - name: Run E2E Test - runs-on: ubuntu-latest - timeout-minutes: 60 - - steps: - - name: Checkout Repository - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Set up Cloud SDK - uses: google-github-actions/setup-gcloud@v2 - with: - version: 'latest' - - - name: Set up Terraform - uses: hashicorp/setup-terraform@v3 - with: - terraform_version: "latest" - - - name: Configure environment and authentication - run: | - # Set sensitive environment variables for e2e script - echo "GITPOD_TOKEN=${{ secrets.E2E_GITPOD_TOKEN }}" >> $GITHUB_ENV - - # Set up service account authentication directly (like dev container) - echo '${{ secrets.E2E_GOOGLE_APPLICATION_CREDENTIALS }}' > /tmp/gcp-sa-key.json - chmod 600 /tmp/gcp-sa-key.json - echo "GOOGLE_APPLICATION_CREDENTIALS=/tmp/gcp-sa-key.json" >> $GITHUB_ENV - - - name: Run E2E Test - run: | - echo "Starting GCP Runner E2E test..." 
- echo "Project: $GCP_PROJECT_ID | Region: $GCP_REGION" - - # Run the E2E test script - chmod +x tests/e2e/scripts/e2e-test.sh - ./tests/e2e/scripts/e2e-test.sh - - - name: Cleanup service account file - if: always() - run: | - # Clean up temporary service account key file - if [[ -f "/tmp/gcp-sa-key.json" ]]; then - rm -f /tmp/gcp-sa-key.json - fi - - - name: Slack Notification on Failure - if: failure() && github.ref == 'refs/heads/main' - uses: rtCamp/action-slack-notify@v2.3.3 - env: - SLACK_WEBHOOK: ${{ secrets.NEXT_ALERTS_SLACK_WEBHOOK }} - SLACK_ICON_EMOJI: ":gcp:" - SLACK_USERNAME: "GCP Runner E2E Tests" - SLACK_COLOR: "danger" - SLACK_MESSAGE: | - :warning: GCP Runner end-to-end tests have failed - - **Details:** - • Project: ${{ env.GCP_PROJECT_ID }} - • Region: ${{ env.GCP_REGION }} - • Workflow: ${{ github.workflow }} - • Commit: ${{ github.sha }} - - The test creates a runner via RunnerService API, deploys infrastructure with Terraform, and verifies the runner comes online. 
- SLACK_FOOTER: "" diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 8460987..7e6aebe 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -61,13 +61,13 @@ jobs: id: changelog run: | if [ -n "${{ steps.prev_tag.outputs.tag }}" ]; then - # Get commits between tags, excluding automated image update commits - CHANGELOG=$(git log --pretty=format:"- %s (%h)" \ + # Get non-merge commits between tags, excluding automated image update commits + CHANGELOG=$(git log --no-merges --pretty=format:"- %s (%h)" \ "${{ steps.prev_tag.outputs.tag }}..${RELEASE_TAG}" \ --grep="Update GCP runner, proxy, prometheus, and node-exporter images" --invert-grep) else - # First release - get all commits excluding automated ones - CHANGELOG=$(git log --pretty=format:"- %s (%h)" \ + # First release - get all non-merge commits excluding automated ones + CHANGELOG=$(git log --no-merges --pretty=format:"- %s (%h)" \ --grep="Update GCP runner, proxy, prometheus, and node-exporter images" --invert-grep) fi @@ -91,8 +91,8 @@ jobs: if [ -n "$CHANGED_FILES" ]; then echo "has_changes=true" >> $GITHUB_OUTPUT - # Get commits that touched IAM files - IAM_COMMITS=$(git log --pretty=format:"- %s (%h)" \ + # Get non-merge commits that touched IAM files + IAM_COMMITS=$(git log --no-merges --pretty=format:"- %s (%h)" \ "${{ steps.prev_tag.outputs.tag }}..${RELEASE_TAG}" \ --grep="Update GCP runner, proxy, prometheus, and node-exporter images" --invert-grep \ -- $IAM_FILES) @@ -124,7 +124,12 @@ jobs: - name: Build release body id: body run: | - cat << 'EOF' > release_body.md + # Derive docs anchor from manifest version: 20260508.526 -> 20260508-526 + DOCS_ANCHOR=$(echo "${{ steps.manifest.outputs.version }}" | tr '.' '-') + + cat << EOF > release_body.md + For application changes in the GCP runner itself, see the [release notes](https://ona.com/docs/release-notes/gcp-runner#${DOCS_ANCHOR}). 
+ ## Container Images | Component | Image | diff --git a/CHANGELOG.md b/CHANGELOG.md deleted file mode 100644 index b5327be..0000000 --- a/CHANGELOG.md +++ /dev/null @@ -1,14 +0,0 @@ -# Changelog - -All notable changes to this project will be documented in this file. - -The format is based on -[Keep a Changelog](https://keepachangelog.com/en/1.0.0/), -and this project adheres to -[Semantic Versioning](https://semver.org/spec/v2.0.0.html). - -## 1.0.0 - -### Added - -- Initial release diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..13f2a6b --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,62 @@ +# Contributing + +[![Build with Ona](https://gitpod.io/button/open-in-gitpod.svg)](https://gitpod.io/#https://github.com/gitpod-io/terraform-google-ona-runner) + +This document provides guidelines for contributing to the Ona GCP Runner Terraform module. + +## Development Environment + +The easiest way to get started is to open this repository in [Ona](https://ona.com/) or run the included [dev container](.devcontainer/) locally with [VS Code Dev Containers](https://code.visualstudio.com/docs/devcontainers/containers) or any compatible IDE. The dev container comes pre-configured with all required tools. + +If you prefer a manual setup, install the following: + +- [Terraform](https://terraform.io/) >= 1.0 +- [Google Cloud SDK](https://cloud.google.com/sdk/install) +- [pre-commit](https://pre-commit.com/) +- [terraform-docs](https://github.com/terraform-docs/terraform-docs) + +## File Structure + +| Path | Description | +|---|---| +| `*.tf` | Root module resources | +| `variables.tf` | Input variables | +| `outputs.tf` | Output values | +| `versions.tf` | Provider and Terraform version constraints | +| `modules/` | Submodules | +| `examples/` | Example configurations | +| `docs/` | Additional documentation | +| `files/` | Template files used by resources | + +## Making Changes + +1. Fork the repository and create a feature branch. +2. 
Make your changes, following the conventions below. +3. Run linting and formatting checks. +4. Submit a pull request against `main`. + +### Linting and Formatting + +This repository uses [pre-commit](https://pre-commit.com/) hooks for `terraform fmt`, `terraform-docs`, `shellcheck`, and general file hygiene. Install the hooks once after cloning: + +```bash +pre-commit install +``` + +To run all checks manually: + +```bash +pre-commit run --all-files +``` + +### Generating Documentation + +Input and output tables in README files are generated automatically by `terraform-docs` via pre-commit. If you change `variables.tf` or `outputs.tf`, the tables will be updated on your next commit. You can also regenerate them manually: + +```bash +pre-commit run terraform_docs --all-files +``` + +## License + +By contributing, you agree that your contributions will be licensed under the [Mozilla Public License 2.0](LICENSE). diff --git a/README.md b/README.md index 255df0c..fa0b9de 100644 --- a/README.md +++ b/README.md @@ -1,253 +1,35 @@ # Ona GCP Runner -This Terraform module deploys an Ona runner on Google Cloud Platform (GCP). - -## Prerequisites - -1. **GCP Project**: A GCP project with billing enabled -2. **Existing Infrastructure**: VPC, subnet, and SSL certificate -3. **SSL Certificate**: A managed certificate in Certificate Manager for your domain -4. **Terraform**: Version >= 1.3 -5. **GCP CLI**: For authentication and project setup - -## Quick Start - -1. **Clone and configure**: - ```bash - git clone - cd - cp terraform.tfvars.example terraform.tfvars - ``` - -2. 
**Edit `terraform.tfvars`** with your values: - ```hcl - project_id = "your-gcp-project-id" - region = "us-central1" - zones = ["us-central1-a", "us-central1-b", "us-central1-c"] - runner_name = "my-ona-runner" - runner_id = "your-runner-id" # From Ona dashboard - runner_token = "your-runner-token" # From Ona dashboard - runner_domain = "ona.example.com" - vpc_name = "your-existing-vpc" # Existing VPC name - runner_subnet_name = "your-existing-subnet" # Existing subnet name - certificate_id = "projects/your-project/locations/global/certificates/your-cert" # Certificate Manager resource ID - - # Optional: Proxy configuration - proxy_config = { - http_proxy = "http://proxy.example.com:8080" - https_proxy = "http://proxy.example.com:8080" - all_proxy = "http://proxy.example.com:8080" - no_proxy = "localhost,127.0.0.1,metadata.google.internal" - } - - # Optional: Custom CA certificate (choose one method) - ca_certificate = { - file_path = "/path/to/ca-certificate.pem" # OR - content = "-----BEGIN CERTIFICATE-----\n..." - } - - # Optional: Use pre-created service accounts - pre_created_service_accounts = { - runner = "my-runner@my-project.iam.gserviceaccount.com" - environment_vm = "my-env-vm@my-project.iam.gserviceaccount.com" - build_cache = "my-build-cache@my-project.iam.gserviceaccount.com" - secret_manager = "my-secrets@my-project.iam.gserviceaccount.com" - pubsub_processor = "my-pubsub@my-project.iam.gserviceaccount.com" - proxy_vm = "my-proxy-vm@my-project.iam.gserviceaccount.com" - } - ``` - -3. **Deploy**: - ```bash - terraform init - terraform plan - terraform apply - ``` - -## Pre-Created Service Accounts - -By default, the module creates 6 service accounts with minimal permissions. 
If your organization requires pre-created service accounts, you can provide them: - -```hcl -pre_created_service_accounts = { - runner = "my-runner@my-project.iam.gserviceaccount.com" - environment_vm = "my-env-vm@my-project.iam.gserviceaccount.com" - build_cache = "my-build-cache@my-project.iam.gserviceaccount.com" - secret_manager = "my-secrets@my-project.iam.gserviceaccount.com" - pubsub_processor = "my-pubsub@my-project.iam.gserviceaccount.com" - proxy_vm = "my-proxy-vm@my-project.iam.gserviceaccount.com" -} -``` - -**Important**: When using pre-created service accounts: -- You must create the required custom IAM roles manually -- You must assign the proper permissions to each service account -- See [IAM Documentation](./docs/iam.md) for complete details - -**Partial Configuration**: You can provide some service accounts and let Terraform create others: -```hcl -pre_created_service_accounts = { - runner = "existing-runner@my-project.iam.gserviceaccount.com" - # Others will be created by Terraform (leave empty or omit) -} -``` - -## Customer-Managed Encryption Keys (CMEK) - -If your organization requires CMEK encryption for compliance with organizational policies like `constraints/gcp.restrictNonCmekServices`, see the [CMEK Setup Guide](./docs/cmek-setup.md). 
- -**Automatic setup** (recommended): -```hcl -# Add to terraform.tfvars: -create_cmek = true -``` - -**Manual setup**: -```hcl -# Create KMS key manually (see docs/cmek-setup.md), then: -create_cmek = false -kms_key_name = "projects/your-project/locations/us-central1/keyRings/ona-keyring/cryptoKeys/ona-key" -``` - -## Module Architecture - -This module creates the Ona runner infrastructure using your existing VPC and certificate: - -- **Load Balancer**: Global HTTPS load balancer with SSL termination -- **Compute**: Auto-scaling VM instances for runner and proxy services -- **Security**: IAM roles, service accounts, and network security - -## Runner with Networking Example - -For a full infrastructure setup including VPC, DNS, and certificates, see the [runner-with-networking example](./examples/runner-with-networking/). - -## Configuration - -### Required Variables - -| Variable | Description | Example | -|----------|-------------|---------| -| `project_id` | GCP project ID | `"my-project-123"` | -| `region` | GCP region | `"us-central1"` | -| `zones` | List of zones | `["us-central1-a", "us-central1-b"]` | -| `runner_name` | Runner identifier | `"my-runner"` | -| `runner_id` | Ona runner ID | `"runner-abc123"` | -| `runner_token` | Runner auth token | `"token-xyz789"` | -| `runner_domain` | Domain for the runner | `"gitpod.example.com"` | -| `vpc_name` | Existing VPC name | `"my-vpc"` | -| `runner_subnet_name` | Existing subnet name | `"my-subnet"` | -| `certificate_id` | Certificate resource ID | `"projects/.../certificates/..."` | - -### Optional Variables - -| Variable | Description | Default | -|----------|-------------|---------| -| `api_endpoint` | Ona API endpoint | `"https://app.gitpod.io/api"` | -| `ssh_port` | SSH port for environments | `29222` | -| `development_version` | Development build version | `""` | -| `labels` | Labels to apply to resources | `{}` | -| `proxy_config` | HTTP/HTTPS proxy configuration | `null` | - - -### Internal Load Balancer 
- -To use an internal load balancer instead of the default external load balancer: - -```hcl -loadbalancer_type = "internal" -routable_subnet_name = "your-routable-subnet" -certificate_secret_id = "projects/your-project/secrets/your-cert-secret" -``` - -**Requirements:** -- `routable_subnet_name`: Subnet where the load balancer IP will be allocated -- `certificate_secret_id`: Secret Manager secret containing certificate data in JSON format: - ```json - { - "certificate": "-----BEGIN CERTIFICATE-----...", - "privateKey": "-----BEGIN PRIVATE KEY-----..." - } - ``` -- VPC must include a subnet with purpose `REGIONAL_MANAGED_PROXY` for the proxy service - -## Examples - -- **[Runner with Networking](./examples/runner-with-networking/)**: Full setup with VPC, DNS, and certificates - -## Monitoring - -The module includes: -- **Prometheus**: Metrics collection on port 9090 -- **Health Checks**: Automated health monitoring -- **Logging**: Centralized logging to Cloud Logging - -## Security - -- All VMs use minimal IAM permissions -- Network traffic is restricted by firewall rules -- SSL/TLS encryption for all external traffic -- Secrets stored in Secret Manager -- CA certificates stored securely in GCS with controlled access - -## Release Notifications - -Ona publishes Pub/Sub messages when new stable GCP runner releases are available. You can subscribe from your own GCP project to receive notifications instead of polling. - -See the [Release Notifications Guide](./docs/release-notifications.md) for topic details, message format, and Terraform/gcloud subscription examples. - -## CA Certificate Management - -The module supports custom CA certificates for proxy environments: - -### Configuration Options - -1. **File-based approach** (recommended for CI/CD): - ```hcl - ca_certificate = { - file_path = "/path/to/ca-certificate.pem" - content = "" - } - ``` - -2. 
**Direct content approach**: - ```hcl - ca_certificate = { - file_path = "" - content = "-----BEGIN CERTIFICATE-----\n...\n-----END CERTIFICATE-----" - } - ``` - -### How it works +[![Build with Ona](https://gitpod.io/button/open-in-gitpod.svg)](https://gitpod.io/#https://github.com/gitpod-io/terraform-google-ona-runner) -1. **Upload**: CA certificate is uploaded to a dedicated GCS bucket during terraform apply -2. **Download**: VMs download the CA certificate from GCS during startup -3. **Usage**: CA certificate is used by Docker daemon and other tools requiring custom trust -4. **Security**: Access to CA bucket is restricted via IAM to only runner and proxy VMs +This is the Terraform module for the Ona GCP Runner. It deploys an +[Ona](https://ona.com) runner in your Google Cloud VPC, where each development +environment runs as a Compute Engine instance inside your project — source code +and credentials never leave your infrastructure. -## Troubleshooting +> GCP Runners require an [Enterprise plan](https://ona.com/pricing). +> To get access, [contact our sales team](https://ona.com/contact/sales). -### Common Issues +Refer to [the Ona documentation](https://ona.com/docs/ona/runners/gcp/overview) +for setup instructions, configuration options, and troubleshooting. -1. **DNS not resolving**: Check domain configuration -2. **Certificate errors**: Ensure certificate is valid and accessible -3. **VM startup failures**: Check Cloud Logging for detailed error messages +--- -### Debugging +

+ GCP Runner architecture +

-```bash -# Check VM logs -gcloud logging read "resource.type=gce_instance" --limit=50 +--- -# Check load balancer health -gcloud compute backend-services get-health -``` +## Example -## Contributing +The [`runner-with-networking`](./examples/runner-with-networking/) example +provides a full infrastructure setup including VPC, DNS, and certificates. -[![Build with Ona](https://gitpod.io/button/open-in-gitpod.svg)](https://gitpod.io/#https://github.com/gitpod-io/terraform-google-ona-runner) +## Releases -1. Fork the repository -2. Create a feature branch -3. Make your changes -4. Test thoroughly -5. Submit a pull request +New stable releases are published roughly once a week. To get notified when a +release is available, subscribe to the Pub/Sub release notifications topic from +your own GCP project. See the +[Release Notifications](https://ona.com/docs/ona/runners/gcp/update-runner#release-notifications) +documentation for topic details, message format, and subscription examples. diff --git a/VERSION b/VERSION new file mode 100644 index 0000000..38f77a6 --- /dev/null +++ b/VERSION @@ -0,0 +1 @@ +2.0.1 diff --git a/build_cache.tf b/build_cache.tf index b0fa868..e432821 100644 --- a/build_cache.tf +++ b/build_cache.tf @@ -53,15 +53,3 @@ resource "google_storage_bucket" "build_cache" { purpose = "buildkit-cache" }) } - -# IAM binding for dedicated build cache service account -resource "google_storage_bucket_iam_member" "cache_access" { - bucket = google_storage_bucket.build_cache.name - role = "roles/storage.objectAdmin" - member = "serviceAccount:${local.build_cache_sa_email}" - - depends_on = [google_storage_bucket.build_cache] -} - -# Note: Required APIs (storage.googleapis.com and iamcredentials.googleapis.com) -# are enabled by the main services module diff --git a/docs/detailed_iam_reference.md b/docs/detailed_iam_reference.md index 562afd0..1207d3a 100644 --- a/docs/detailed_iam_reference.md +++ b/docs/detailed_iam_reference.md @@ -8,13 +8,13 @@ The module 
creates a secure, least-privilege infrastructure for running Ona work ## Service Accounts -**Architecture Rationale**: Ona uses multiple specialized service accounts instead of a single account to implement defense-in-depth security: +The module creates 3 service accounts: -- **Blast Radius Limitation**: If one service account is compromised, damage is limited to its specific function -- **Principle of Least Privilege**: Each account has only the minimum permissions needed for its role -- **Audit Granularity**: Security events can be traced to specific functions (e.g., secret access vs. VM creation) -- **Operational Isolation**: Different operational concerns (compute, storage, secrets) are separated -- **Compliance**: Easier to demonstrate security controls to auditors with clear separation of duties +1. **Runner** (`runner`) — manages infrastructure and orchestrates workspace lifecycle +2. **Environment VM** (`environment_vm`) — used by workspace VMs with minimal permissions +3. **Proxy VM** (`proxy_vm`) — used by proxy VMs for load balancing and traffic routing + +> **Note**: Previous versions created additional service accounts for build cache, secret management, and pub/sub processing. These have been removed — their permissions are handled directly by the runner service account. ### 1. Runner Service Account (`runner`) **Purpose**: Manages the runner infrastructure and orchestrates workspace lifecycle @@ -65,59 +65,7 @@ The module creates a secure, least-privilege infrastructure for running Ona work **Security Rationale**: Workspace VMs have minimal permissions - only metric writing, logging, and reading container images. No access to other workspaces, secrets, or infrastructure management. This limits blast radius if a workspace is compromised. -### 3. 
Build Cache Service Account (`build_cache`) -**Purpose**: Manages build cache storage and access - -**Display Name**: Ona Build Cache -**Account ID**: `{runner_name}-build-cache` -**Description**: Service account for GCS build cache operations - -**OAuth Scopes**: None (uses IAM permissions only) - -**IAM Roles**: -- `roles/storage.objectAdmin` - Manage build cache objects in Cloud Storage (upload, download, delete cached build artifacts and container layers) -- `roles/logging.logWriter` - Write logs to Cloud Logging for cache operation auditing - -**Security Rationale**: Isolated service account prevents workspace VMs from accessing build cache directly. Runner impersonates this account only when cache operations are needed, providing controlled access and audit trail. - -### 4. Secret Manager Service Account (`secret_manager`) -**Purpose**: Manages workspace secrets and environment variables - -**Display Name**: Ona Secret Manager -**Account ID**: `{runner_name}-secrets` -**Description**: Service account for environment secret management - -**OAuth Scopes**: None (uses IAM permissions only) - -**IAM Roles**: -- **Custom Role**: `{runner_name}_secret_manager` - Scoped permissions for secret management: - - `secretmanager.secrets.create` - Create new workspace secrets - - `secretmanager.secrets.delete` - Delete unused secrets - - `secretmanager.secrets.get` - Get secret metadata - - `secretmanager.secrets.list` - List secrets for management - - `secretmanager.versions.access` - Access secret values - - `secretmanager.versions.add` - Add new secret versions - - `secretmanager.versions.destroy` - Delete secret versions -- `roles/logging.logWriter` - Write logs to Cloud Logging for secret operation auditing - -**Security Rationale**: Dedicated service account for secret operations with minimal custom permissions (no IAM policy management). Secrets are isolated per workspace. 
Runner impersonates this account only during secret operations, ensuring secrets are never directly accessible to workspace VMs. - -### 5. Pub/Sub Processor Service Account (`pubsub_processor`) -**Purpose**: Processes Pub/Sub messages for event-driven operations - -**Display Name**: Ona Pub/Sub Processor -**Account ID**: `{runner_name}-pubsub` -**Description**: Service account for processing Pub/Sub compute events - -**OAuth Scopes**: None (uses IAM permissions only) - -**IAM Roles**: -- `roles/monitoring.metricWriter` - Write processing metrics for monitoring (message processing rates, queue depths, error rates) -- `roles/logging.logWriter` - Write logs to Cloud Logging for event processing auditing - -**Security Rationale**: Separate service account for async message processing provides isolation from synchronous operations. Limited to metric writing and logging only - no access to compute resources or secrets. The runner service account can impersonate this account for event processing operations. - -### 6. Proxy VM Service Account (`proxy_vm`) +### 3. Proxy VM Service Account (`proxy_vm`) **Purpose**: Used by proxy VMs for load balancing and traffic routing **Display Name**: Ona Proxy VM Service @@ -262,43 +210,21 @@ The runner service account uses a custom IAM role with minimal required permissi ### IAM Permissions -**Service Account Management**: -- `iam.serviceAccounts.actAs` - Impersonate other service accounts +**Service Account Management** (project-level on the runner custom role): - `iam.serviceAccounts.getIamPolicy` - Get service account IAM policies - `iam.serviceAccounts.setIamPolicy` - Set service account IAM policies -- `iam.serviceAccounts.getAccessToken` - Generate access tokens - - -## Service Account Impersonation and Usage - -**Security Pattern**: The runner uses service account impersonation and controlled usage instead of direct permissions to enhance security and auditability. 
- -**Why Impersonation/Usage Instead of Direct Permissions**: -- **Temporal Access Control**: Tokens are generated only when needed and have short lifespans (1 hour max) -- **Audit Trail**: Every impersonation/usage event is logged, providing clear audit trail of when and why privileged operations occurred -- **Credential Rotation**: No long-lived credentials stored on VMs - tokens are generated on-demand -- **Scope Limitation**: Each impersonated token has only the permissions of the target service account -- **Revocation**: Impersonation can be revoked instantly by removing TokenCreator/User roles -The runner service account can impersonate or use other service accounts for specific operations: +**Per-SA bindings** (granted via `google_service_account_iam_member` on +specific SAs only): +- `roles/iam.serviceAccountUser` (i.e. `iam.serviceAccounts.actAs`) on + the runner, environment_vm, and proxy_vm service accounts. Required to + attach those SAs to the instances and instance templates the runner + creates. 
-### Build Cache Token Generation -- **Target**: `build_cache` service account -- **Role**: `roles/iam.serviceAccountTokenCreator` -- **Purpose**: Generate short-lived tokens for build cache operations (upload/download artifacts) -- **Security Benefit**: Build cache access is logged and time-limited -### Secret Manager Token Generation -- **Target**: `secret_manager` service account -- **Role**: `roles/iam.serviceAccountTokenCreator` -- **Purpose**: Generate short-lived tokens for secret management operations (create/read workspace secrets) -- **Security Benefit**: Secret access is logged and time-limited, preventing credential theft +## Runner Direct Permissions -### Pub/Sub Processor Service Account Usage -- **Target**: `pubsub_processor` service account -- **Role**: `roles/iam.serviceAccountUser` -- **Purpose**: Use the pub/sub processor service account for event handling operations -- **Security Benefit**: Event processing is isolated and auditable through separate service account +The runner service account directly holds permissions for build cache (GCS), secret management, and pub/sub event processing. These were previously delegated to separate service accounts via impersonation but are now consolidated on the runner SA for simplicity. 
## Audit Logging Configuration diff --git a/docs/iam.md b/docs/iam.md index 9ac10dd..e0f0a70 100644 --- a/docs/iam.md +++ b/docs/iam.md @@ -93,7 +93,7 @@ When using pre-created service accounts, these roles must be created beforehand - **Title**: Ona Runner - **Description**: Minimal permissions for runner infrastructure management -**Permissions** (68 total): +**Permissions** (75 total): ``` # Instance lifecycle management compute.instances.create @@ -102,6 +102,7 @@ compute.instances.get compute.instances.list compute.instances.start compute.instances.stop +compute.instances.resume compute.instances.setLabels compute.instances.setMetadata compute.instances.setTags @@ -183,10 +184,11 @@ pubsub.topics.get pubsub.topics.list # IAM (service account management) -iam.serviceAccounts.actAs +# actAs (roles/iam.serviceAccountUser) is granted per-SA on the runner, +# environment_vm, and proxy_vm SAs via google_service_account_iam_member +# resources, scoped to those SAs only. iam.serviceAccounts.getIamPolicy iam.serviceAccounts.setIamPolicy -iam.serviceAccounts.getAccessToken # Instance templates and groups compute.instanceTemplates.create @@ -201,32 +203,26 @@ compute.instanceGroupManagers.list compute.instanceGroupManagers.create compute.instanceGroupManagers.delete compute.instanceGroupManagers.update +compute.instanceGroupManagers.use compute.instanceGroups.delete compute.instanceGroups.list +# Autoscaler (dynamic warm pool scaling) +compute.autoscalers.create +compute.autoscalers.delete +compute.autoscalers.get +compute.autoscalers.update + +# Cloud Monitoring (warm pool scaling metrics) +monitoring.timeSeries.create + # Cloud Logging (prebuild log persistence) logging.logEntries.list logging.logEntries.create logging.logs.delete ``` -### 2. 
Secret Manager Custom Role -- **Role ID**: `{runner_name_underscore}_secret_manager` (e.g., `gcp_2_secret_manager`) -- **Title**: Ona Secret Manager -- **Description**: Scoped permissions for environment secret management - -**Permissions** (7 total): -``` -secretmanager.secrets.create -secretmanager.secrets.delete -secretmanager.secrets.get -secretmanager.secrets.list -secretmanager.versions.access -secretmanager.versions.add -secretmanager.versions.destroy -``` - -### 3. Proxy VM Custom Role +### 2. Proxy VM Custom Role - **Role ID**: `{runner_name_underscore}_proxy_vm` (e.g., `gcp_2_proxy_vm`) - **Title**: Ona Proxy VM Minimal - **Description**: Minimal permissions for Ona proxy VM instances @@ -312,9 +308,8 @@ If not pre-created, the module will create the following service accounts: - `roles/secretmanager.secretAccessor` on runner token secret - `roles/secretmanager.secretAccessor` on metrics configuration secret - `roles/secretmanager.secretVersionManager` - Manage secret versions -- `roles/storage.objectViewer` on runner assets bucket -- `roles/iam.serviceAccountTokenCreator` on build cache service account -- `roles/iam.serviceAccountTokenCreator` on secret manager service account +- `roles/storage.objectAdmin` on runner assets bucket +- `roles/storage.objectAdmin` on build cache bucket - `roles/pubsub.subscriber` on compute events subscription - `roles/pubsub.viewer` on dead letter subscription - `roles/cloudkms.cryptoKeyEncrypterDecrypter` on KMS key (if CMEK is enabled) @@ -335,56 +330,7 @@ If not pre-created, the module will create the following service accounts: **Resource-Specific Access**: - `roles/cloudkms.cryptoKeyEncrypterDecrypter` on KMS key (if CMEK is enabled) -### 3. 
Build Cache Service Account -- **Name**: `{runner_name}-build-cache` (e.g., `gcp-2-build-cache`) -- **Display Name**: Ona Build Cache -- **Purpose**: GCS build cache operations -- **Used By**: BuildKit for container image caching - -**Custom Roles**: None - -**Predefined Roles**: -- `roles/logging.logWriter` - Write logs - -**Resource-Specific Access**: -- `roles/storage.objectAdmin` on build cache bucket -- `roles/cloudkms.cryptoKeyEncrypterDecrypter` on KMS key (if CMEK is enabled) - -### 4. Secret Manager Service Account -- **Name**: `{runner_name}-secrets` (e.g., `gcp-2-secrets`) -- **Display Name**: Ona Secret Manager -- **Purpose**: Environment-specific secret management -- **Used By**: Runner for managing user secrets - -**Custom Roles**: None (uses custom role #2 via impersonation) - -**Predefined Roles**: -- `roles/logging.logWriter` - Write logs - -**Resource-Specific Access**: -- `roles/cloudkms.cryptoKeyEncrypterDecrypter` on KMS key (if CMEK is enabled) - -**Note**: This service account is used via impersonation by the Runner service account - -### 5. Pub/Sub Processor Service Account -- **Name**: `{runner_name}-pubsub` (e.g., `gcp-2-pubsub`) -- **Display Name**: Ona Pub/Sub Processor -- **Purpose**: Event-driven reconciliation -- **Used By**: Event processing workflows - -**Custom Roles**: None - -**Predefined Roles**: -- `roles/logging.logWriter` - Write logs -- `roles/monitoring.metricWriter` - Write metrics - -**Resource-Specific Access**: -- `roles/cloudkms.cryptoKeyEncrypterDecrypter` on KMS key (if CMEK is enabled) - -**Service Account Usage**: -- Runner can use this service account (`roles/iam.serviceAccountUser`) - -### 6. Proxy VM Service Account +### 3. 
Proxy VM Service Account - **Name**: `{runner_name}-proxy-vm` (e.g., `gcp-2-proxy-vm`) - **Display Name**: Ona Proxy VM Service - **Purpose**: Minimal permissions for proxy functionality @@ -432,25 +378,7 @@ gcloud iam service-accounts create ${RUNNER_NAME}-env-vm \ --description="Minimal service account for environment VMs" \ --project=${PROJECT_ID} -# 3. Build cache service account -gcloud iam service-accounts create ${RUNNER_NAME}-build-cache \ - --display-name="Ona Build Cache" \ - --description="Service account for GCS build cache operations" \ - --project=${PROJECT_ID} - -# 4. Secret manager service account -gcloud iam service-accounts create ${RUNNER_NAME}-secrets \ - --display-name="Ona Secret Manager" \ - --description="Service account for environment secret management" \ - --project=${PROJECT_ID} - -# 5. Pub/Sub processor service account -gcloud iam service-accounts create ${RUNNER_NAME}-pubsub \ - --display-name="Ona Pub/Sub Processor" \ - --description="Service account for processing Pub/Sub compute events" \ - --project=${PROJECT_ID} - -# 6. Proxy VM service account +# 3. Proxy VM service account gcloud iam service-accounts create ${RUNNER_NAME}-proxy-vm \ --display-name="Ona Proxy VM Service" \ --description="Service account for Ona proxy VM instances" \ @@ -475,6 +403,7 @@ includedPermissions: - compute.instances.list - compute.instances.start - compute.instances.stop +- compute.instances.resume - compute.instances.setLabels - compute.instances.setMetadata - compute.instances.setTags @@ -535,10 +464,12 @@ includedPermissions: - pubsub.subscriptions.consume - pubsub.topics.get - pubsub.topics.list -- iam.serviceAccounts.actAs - iam.serviceAccounts.getIamPolicy - iam.serviceAccounts.setIamPolicy -- iam.serviceAccounts.getAccessToken +# iam.serviceAccounts.actAs is granted per-SA on the runner, +# environment_vm, and proxy_vm SAs via +# google_service_account_iam_member resources, scoped to those SAs +# only. 
- compute.instanceTemplates.create - compute.instanceTemplates.delete - compute.instanceTemplates.get @@ -551,8 +482,14 @@ includedPermissions: - compute.instanceGroupManagers.create - compute.instanceGroupManagers.delete - compute.instanceGroupManagers.update +- compute.instanceGroupManagers.use - compute.instanceGroups.delete - compute.instanceGroups.list +- compute.autoscalers.create +- compute.autoscalers.delete +- compute.autoscalers.get +- compute.autoscalers.update +- monitoring.timeSeries.create - logging.logEntries.list - logging.logEntries.create - logging.logs.delete @@ -562,26 +499,7 @@ gcloud iam roles create ${RUNNER_NAME_UNDERSCORE}_runner \ --project=${PROJECT_ID} \ --file=runner-role.yaml -# 2. Secret manager custom role -cat > secret-manager-role.yaml << EOF -title: "Ona Secret Manager" -description: "Scoped permissions for environment secret management" -stage: "GA" -includedPermissions: -- secretmanager.secrets.create -- secretmanager.secrets.delete -- secretmanager.secrets.get -- secretmanager.secrets.list -- secretmanager.versions.access -- secretmanager.versions.add -- secretmanager.versions.destroy -EOF - -gcloud iam roles create ${RUNNER_NAME_UNDERSCORE}_secret_manager \ - --project=${PROJECT_ID} \ - --file=secret-manager-role.yaml - -# 3. Proxy VM custom role +# 2. 
Proxy VM custom role cat > proxy-vm-role.yaml << EOF title: "Ona Proxy VM Minimal" description: "Minimal permissions for Ona proxy VM instances" @@ -601,7 +519,7 @@ gcloud iam roles create ${RUNNER_NAME_UNDERSCORE}_proxy_vm \ --file=proxy-vm-role.yaml # Clean up temporary files -rm -f runner-role.yaml secret-manager-role.yaml proxy-vm-role.yaml +rm -f runner-role.yaml proxy-vm-role.yaml ``` ### Assign Project-Level Permissions @@ -653,25 +571,6 @@ gcloud projects add-iam-policy-binding ${PROJECT_ID} \ --member="serviceAccount:${RUNNER_NAME}-env-vm@${PROJECT_ID}.iam.gserviceaccount.com" \ --role="roles/cloudkms.cryptoKeyEncrypterDecrypter" -# Build cache service account - logging only -gcloud projects add-iam-policy-binding ${PROJECT_ID} \ - --member="serviceAccount:${RUNNER_NAME}-build-cache@${PROJECT_ID}.iam.gserviceaccount.com" \ - --role="roles/logging.logWriter" - -# Secret manager service account - logging only -gcloud projects add-iam-policy-binding ${PROJECT_ID} \ - --member="serviceAccount:${RUNNER_NAME}-secrets@${PROJECT_ID}.iam.gserviceaccount.com" \ - --role="roles/logging.logWriter" - -# Pub/Sub processor service account - logging and monitoring -gcloud projects add-iam-policy-binding ${PROJECT_ID} \ - --member="serviceAccount:${RUNNER_NAME}-pubsub@${PROJECT_ID}.iam.gserviceaccount.com" \ - --role="roles/logging.logWriter" - -gcloud projects add-iam-policy-binding ${PROJECT_ID} \ - --member="serviceAccount:${RUNNER_NAME}-pubsub@${PROJECT_ID}.iam.gserviceaccount.com" \ - --role="roles/monitoring.metricWriter" - # Proxy VM service account - proxy functionality gcloud projects add-iam-policy-binding ${PROJECT_ID} \ --member="serviceAccount:${RUNNER_NAME}-proxy-vm@${PROJECT_ID}.iam.gserviceaccount.com" \ @@ -733,11 +632,6 @@ Project-level roles that need to be manually assigned via the GCP Console or `gc | **`${RUNNER_NAME}-env-vm`** | `roles/artifactregistry.reader` | | | | `roles/logging.logWriter` | | | | `roles/monitoring.metricWriter` | | -| 
**`${RUNNER_NAME}-build-cache`** | `roles/logging.logWriter` | | -| **`${RUNNER_NAME}-secrets`** | `roles/secretmanager.admin` | ★ Secret Manager | -| | `roles/logging.logWriter` | | -| **`${RUNNER_NAME}-pubsub`** | `roles/logging.logWriter` | | -| | `roles/monitoring.metricWriter` | | | **`${RUNNER_NAME}-proxy-vm`** | `roles/compute.viewer` | ★ Proxy VM | | | `roles/logging.logWriter` | | | | `roles/monitoring.metricWriter` | | @@ -752,11 +646,11 @@ Replace the single runner custom role binding with these 8 predefined roles: ```bash export SA="${RUNNER_NAME}-runner@${PROJECT_ID}.iam.gserviceaccount.com" -# Compute instance, disk, network, template, and MIG management. -# Needed: 37 compute permissions for VM lifecycle, disks, networks, operations, -# machine/disk types, instance templates, and MIG updates. -# Excess: grants 228 additional permissions including autoscaler, network endpoint -# group, and machine image management that the runner does not use. +# Compute instance, disk, network, template, MIG, and autoscaler management. +# Needed: 42 compute permissions for VM lifecycle, disks, networks, operations, +# machine/disk types, instance templates, MIG updates, and autoscaler scaling. +# Excess: grants 223 additional permissions including network endpoint +# group and machine image management that the runner does not use. gcloud projects add-iam-policy-binding ${PROJECT_ID} \ --member="serviceAccount:${SA}" \ --role="roles/compute.instanceAdmin" @@ -820,24 +714,6 @@ gcloud projects add-iam-policy-binding ${PROJECT_ID} \ --role="roles/iam.serviceAccountUser" ``` -### Secret Manager Custom Role → Predefined Role - -Replace the single secret manager custom role binding with 1 predefined role: - -```bash -export SA="${RUNNER_NAME}-secrets@${PROJECT_ID}.iam.gserviceaccount.com" - -# Secret lifecycle and version management. 
-# Needed: secrets.create, .delete, .get, .list, versions.access, .add, .destroy -# Excess: grants 20 additional permissions including secrets.setIamPolicy, -# secrets.getIamPolicy, secrets.update, versions.disable/.enable/.get/.list, -# and KMS-related permissions. -# Note: secrets.create and secrets.delete only exist in this admin role. -gcloud projects add-iam-policy-binding ${PROJECT_ID} \ - --member="serviceAccount:${SA}" \ - --role="roles/secretmanager.admin" -``` - ### Proxy VM Custom Role → Predefined Role Replace the single proxy VM custom role binding with 1 predefined role: diff --git a/docs/images/arch-diagram.png b/docs/images/arch-diagram.png new file mode 100644 index 0000000..2ac08a4 Binary files /dev/null and b/docs/images/arch-diagram.png differ diff --git a/docs/proxy.md b/docs/proxy.md deleted file mode 100644 index a75e9f5..0000000 --- a/docs/proxy.md +++ /dev/null @@ -1,568 +0,0 @@ -# Ona Runner Proxy Architecture - -## Overview - -The Ona Runner Proxy provides secure access to development environments by acting as a gateway/gatekeeper between users and their workspaces. Instead of relying on the centralized Ona gateway, this architecture deploys a local proxy with a load balancer frontend, offering better performance, customization, and control. 
- -## Architecture Context - -Based on the Enterprise Runner architecture, the proxy serves as a replacement for the gateway component: - -``` -┌─────────────────┐ ┌──────────────────────────────────────┐ -│ Customer │ │ AWS Account │ -│ Browser │────┤ │ -└─────────────────┘ │ ┌──────────────┐ ┌─────────────┐ │ - │ │ Load Balancer│───│ Proxy │ │ - │ │ (Public) │ │ (Gateway) │ │ - │ └──────────────┘ └─────────────┘ │ - │ │ │ │ - │ ┌─────────────────┐ │ │ - │ │ GKE/EKS │ │ │ - │ │ Cluster │───────┘ │ - │ │ │ │ - │ │ ┌─────────────┐ │ │ - │ │ │Environment 1│ │ │ - │ │ ┌─────────────┐ │ │ - │ │ │Environment 2│ │ │ - │ │ ┌─────────────┐ │ │ - │ │ │Environment 3│ │ │ - │ └─────────────────┘ │ - └──────────────────────────────────────┘ -``` - -## Configuration - -The proxy domain is configured via the `RunnerProxyDomain` field in the runner configuration: - -```go -// Proxy Server configuration (for SCM OAuth redirection) -RunnerProxyDomain string // Optional: Local proxy domain instead of using gateway -``` - -## Option 1: External Load Balancer with Public Access - -### Architecture -- **Load Balancer**: Google Cloud Load Balancer (HTTPS) -- **SSL Certificate**: Google-managed SSL certificate with wildcard domain -- **Access**: Public internet access with authentication -- **DNS**: Cloud DNS records pointing to load balancer IP - -### Components - -#### 1. 
Load Balancer Configuration -```hcl -# External Application Load Balancer -resource "google_compute_global_address" "proxy_ip" { - name = "${var.name_prefix}-proxy-ip" -} - -resource "google_compute_managed_ssl_certificate" "proxy_cert" { - name = "${var.name_prefix}-proxy-cert" - - managed { - domains = ["*.${var.proxy_domain}"] - } -} - -resource "google_compute_url_map" "proxy_lb" { - name = "${var.name_prefix}-proxy-lb" - default_service = google_compute_backend_service.proxy_backend.id -} - -resource "google_compute_target_https_proxy" "proxy_target" { - name = "${var.name_prefix}-proxy-target" - url_map = google_compute_url_map.proxy_lb.id - ssl_certificates = [google_compute_managed_ssl_certificate.proxy_cert.id] -} - -resource "google_compute_global_forwarding_rule" "proxy_forwarding_rule" { - name = "${var.name_prefix}-proxy-rule" - ip_protocol = "TCP" - load_balancing_scheme = "EXTERNAL" - port_range = "443" - target = google_compute_target_https_proxy.proxy_target.id - ip_address = google_compute_global_address.proxy_ip.id -} -``` - -#### 2. Proxy Service (Cloud Run) -```hcl -resource "google_cloud_run_v2_service" "proxy" { - name = "${var.name_prefix}-proxy" - location = var.region - - template { - service_account = google_service_account.proxy_sa.email - - containers { - image = var.proxy_image_url - - ports { - container_port = 8080 - } - - env { - name = "PROXY_DOMAIN" - value = var.proxy_domain - } - - env { - name = "RUNNER_ID" - value = var.runner_id - } - } - } - - traffic { - type = "TRAFFIC_TARGET_ALLOCATION_TYPE_LATEST" - percent = 100 - } -} -``` - -#### 3. 
Backend Service -```hcl -resource "google_compute_backend_service" "proxy_backend" { - name = "${var.name_prefix}-proxy-backend" - load_balancing_scheme = "EXTERNAL" - protocol = "HTTP" - - backend { - group = google_compute_region_network_endpoint_group.proxy_neg.id - } - - health_checks = [google_compute_health_check.proxy_health.id] -} - -resource "google_compute_region_network_endpoint_group" "proxy_neg" { - name = "${var.name_prefix}-proxy-neg" - network_endpoint_type = "SERVERLESS" - region = var.region - - cloud_run { - service = google_cloud_run_v2_service.proxy.name - } -} -``` - -#### 4. DNS Configuration -```hcl -resource "google_dns_managed_zone" "proxy_zone" { - name = "${var.name_prefix}-proxy-zone" - dns_name = "${var.proxy_domain}." -} - -resource "google_dns_record_set" "proxy_wildcard" { - name = "*.${google_dns_managed_zone.proxy_zone.dns_name}" - type = "A" - ttl = 300 - - managed_zone = google_dns_managed_zone.proxy_zone.name - - rrdatas = [google_compute_global_address.proxy_ip.address] -} - -resource "google_dns_record_set" "proxy_root" { - name = google_dns_managed_zone.proxy_zone.dns_name - type = "A" - ttl = 300 - - managed_zone = google_dns_managed_zone.proxy_zone.name - - rrdatas = [google_compute_global_address.proxy_ip.address] -} -``` - -### Pros -- **Simple setup**: Standard Google Cloud Load Balancer -- **Automatic SSL management**: Google-managed certificates -- **Global accessibility**: Can be accessed from anywhere -- **Auto-scaling**: Cloud Run handles scaling automatically - -### Cons -- **Public exposure**: Load balancer has public IP -- **Cost**: Higher cost due to global load balancer -- **Security**: Requires additional authentication layers - -## Option 2: Internal Load Balancer with Private Access - -### Architecture -- **Load Balancer**: Internal Application Load Balancer -- **SSL Certificate**: Google-managed certificate (requires DNS delegation) -- **Access**: Private network access only -- **DNS**: Customer DNS 
delegation to Google Cloud DNS - -### Components - -#### 1. Internal Load Balancer Configuration -```hcl -# Internal Application Load Balancer -resource "google_compute_address" "proxy_internal_ip" { - name = "${var.name_prefix}-proxy-internal-ip" - subnetwork = var.subnet_name - address_type = "INTERNAL" - region = var.region -} - -resource "google_compute_region_url_map" "proxy_internal_lb" { - name = "${var.name_prefix}-proxy-internal-lb" - region = var.region - default_service = google_compute_region_backend_service.proxy_internal_backend.id -} - -resource "google_compute_region_target_https_proxy" "proxy_internal_target" { - name = "${var.name_prefix}-proxy-internal-target" - region = var.region - url_map = google_compute_region_url_map.proxy_internal_lb.id - ssl_certificates = [google_compute_region_ssl_certificate.proxy_internal_cert.id] -} - -resource "google_compute_forwarding_rule" "proxy_internal_forwarding_rule" { - name = "${var.name_prefix}-proxy-internal-rule" - region = var.region - ip_protocol = "TCP" - load_balancing_scheme = "INTERNAL_MANAGED" - port_range = "443" - target = google_compute_region_target_https_proxy.proxy_internal_target.id - ip_address = google_compute_address.proxy_internal_ip.id - network = var.vpc_name - subnetwork = var.subnet_name -} -``` - -#### 2. Google-Managed SSL Certificate -```hcl -# DNS managed zone for internal domain -resource "google_dns_managed_zone" "proxy_internal_zone" { - name = "${var.name_prefix}-proxy-internal-zone" - dns_name = "${var.proxy_domain}." 
- description = "Internal DNS zone for proxy domain" - - visibility = "private" - - private_visibility_config { - networks { - network_url = var.vpc_id - } - } -} - -# Google-managed SSL certificate for internal use -resource "google_compute_region_ssl_certificate" "proxy_internal_cert" { - name_prefix = "${var.name_prefix}-proxy-internal-cert-" - region = var.region - - managed { - domains = ["*.${var.proxy_domain}"] - } - - lifecycle { - create_before_destroy = true - } -} -``` - -#### 3. DNS Records for Internal Access -```hcl -# Internal DNS records -resource "google_dns_record_set" "proxy_internal_wildcard" { - name = "*.${google_dns_managed_zone.proxy_internal_zone.dns_name}" - type = "A" - ttl = 300 - - managed_zone = google_dns_managed_zone.proxy_internal_zone.name - - rrdatas = [google_compute_address.proxy_internal_ip.address] -} - -resource "google_dns_record_set" "proxy_internal_root" { - name = google_dns_managed_zone.proxy_internal_zone.dns_name - type = "A" - ttl = 300 - - managed_zone = google_dns_managed_zone.proxy_internal_zone.name - - rrdatas = [google_compute_address.proxy_internal_ip.address] -} -``` - -#### 4. 
Internal Backend Service -```hcl -resource "google_compute_region_backend_service" "proxy_internal_backend" { - name = "${var.name_prefix}-proxy-internal-backend" - region = var.region - load_balancing_scheme = "INTERNAL_MANAGED" - protocol = "HTTP" - - backend { - group = google_compute_region_network_endpoint_group.proxy_internal_neg.id - } - - health_checks = [google_compute_region_health_check.proxy_internal_health.id] -} - -resource "google_compute_region_network_endpoint_group" "proxy_internal_neg" { - name = "${var.name_prefix}-proxy-internal-neg" - network_endpoint_type = "SERVERLESS" - region = var.region - - cloud_run { - service = google_cloud_run_v2_service.proxy.name - } -} -``` - - -### Pros -- **Enhanced security**: No public internet exposure -- **Lower cost**: Regional load balancer costs less -- **Network isolation**: Stays within private network -- **Compliance**: Better for regulated industries -- **Managed certificates**: Google-managed SSL certificates - -### Cons -- **DNS requirements**: Requires DNS delegation for certificate validation -- **Limited accessibility**: Only from within VPC or connected networks -- **Setup complexity**: More complex than external load balancer - - -## Terraform Implementation Plan - -### Module Structure -``` -modules/ -├── proxy-external/ # Option 1: External Load Balancer -│ ├── main.tf -│ ├── variables.tf -│ ├── outputs.tf -│ └── versions.tf -└── proxy-internal/ # Option 2: Internal Load Balancer - ├── main.tf - ├── variables.tf - ├── outputs.tf - └── versions.tf -``` - -### Variables Configuration -```hcl -variable "proxy_config" { - description = "Proxy configuration" - type = object({ - enabled = bool - type = string # "external", "internal" - domain = string - - # Optional configurations - image_url = optional(string) - - # DNS configuration - dns_zone_name = optional(string) - }) - default = { - enabled = false - type = "external" - domain = "" - } -} -``` - -### Integration with Main Module -```hcl -# 
In main.tf -module "proxy" { - count = var.proxy_config.enabled ? 1 : 0 - source = "./modules/proxy-${var.proxy_config.type}" - - project_id = var.project_id - region = var.region - name_prefix = local.name_prefix - labels = local.common_labels - - # Networking - vpc_name = local.vpc_name - subnet_name = local.subnet_name - subnet_cidr = var.vpc_config.subnet_cidr - - # Proxy configuration - proxy_domain = var.proxy_config.domain - proxy_config = var.proxy_config - - # Service account - service_account_email = module.security.proxy_service_account_email - - depends_on = [module.services] -} -``` - -### Output Configuration -```hcl -# Proxy outputs -output "proxy_configuration" { - description = "Proxy configuration details" - value = var.proxy_config.enabled ? { - enabled = true - type = var.proxy_config.type - domain = var.proxy_config.domain - load_balancer_ip = try(module.proxy[0].load_balancer_ip, null) - service_url = try(module.proxy[0].service_url, null) - dns_records = try(module.proxy[0].dns_records, []) - ssl_certificate = try(module.proxy[0].ssl_certificate_info, null) - } : null -} -``` - -## Security Considerations - -### Authentication & Authorization -- **OAuth Integration**: Support for SCM OAuth flows -- **JWT Validation**: Verify tokens from Ona management plane -- **RBAC**: Role-based access control for environments -- **Audit Logging**: Log all access attempts and proxy actions - -### Network Security -- **IP Allowlisting**: Restrict access to known IP ranges -- **Rate Limiting**: Prevent abuse and DDoS attacks -- **Header Validation**: Validate and sanitize HTTP headers -- **TLS Configuration**: Strong TLS cipher suites and protocols - -### Certificate Management -- **Automatic Renewal**: Automated certificate renewal process -- **Certificate Monitoring**: Alert on certificate expiration -- **Key Rotation**: Regular rotation of private keys -- **Certificate Pinning**: Optional certificate pinning for enhanced security - -## Deployment 
Considerations - -### Prerequisites -1. **Domain Ownership**: Customer must own the proxy domain -2. **DNS Management**: Access to configure DNS records or delegate DNS to Google Cloud -3. **Network Planning**: VPC connectivity for internal option (VPN, Interconnect, or peering) - -### Migration Strategy -1. **Parallel Deployment**: Deploy proxy alongside existing gateway -2. **Gradual Cutover**: Route traffic incrementally to proxy -3. **Rollback Plan**: Quick rollback to gateway if issues occur -4. **Monitoring**: Comprehensive monitoring during migration - -### Operational Requirements -- **Health Checks**: Automated health monitoring -- **Certificate Management**: Automated certificate renewal via Google-managed certificates -- **Scaling**: Automatic scaling based on demand -- **Updates**: Rolling updates for proxy service - -## Summary - -This architecture provides two robust options for proxy deployment: - -1. **External Load Balancer**: Best for general use cases with public internet access -2. **Internal Load Balancer**: Best for enterprise environments requiring private network access - -Both options use Google-managed SSL certificates and integrate seamlessly with the existing Ona runner infrastructure, providing a secure and scalable gateway for development environment access. 
- -## Cost Estimation - -### Assumptions -- **Usage**: 12 hours per day (50% daily utilization) -- **Data Transfer**: 1TB per month total -- **Region**: us-central1 (Iowa) -- **Currency**: USD -- **Pricing**: Based on Google Cloud pricing as of 2024 - -### Option 1: External Load Balancer Cost Breakdown - -| Component | Specification | Monthly Cost | -|-----------|---------------|--------------| -| **Global External Application Load Balancer** | | | -| - Forwarding rules | 1 rule × $18/month | $18.00 | -| - Processing charges | 1TB × $0.008/GB | $8.00 | -| **Cloud Run (Proxy Service)** | | | -| - CPU allocation | 1 vCPU × 12h/day × 30 days × $0.00002400/vCPU-sec | $31.10 | -| - Memory allocation | 2GB × 12h/day × 30 days × $0.00000250/GB-sec | $6.48 | -| - Requests | 1M requests × $0.40/1M requests | $0.40 | -| **Google-Managed SSL Certificate** | Wildcard certificate | $0.00 | -| **Cloud DNS** | | | -| - Hosted zone | 1 zone × $0.20/month | $0.20 | -| - DNS queries | 1M queries × $0.40/1M queries | $0.40 | -| **Egress Traffic** | | | -| - Internet egress | 1TB × $0.12/GB (>1GB tier) | $120.00 | -| **Network Endpoint Group** | Regional NEG | $0.00 | -| | **Total Monthly Cost** | **$184.58** | - -### Option 2: Internal Load Balancer Cost Breakdown - -| Component | Specification | Monthly Cost | -|-----------|---------------|--------------| -| **Internal Application Load Balancer** | | | -| - Forwarding rules | 1 rule × $18/month | $18.00 | -| - Processing charges | 1TB × $0.008/GB | $8.00 | -| **Cloud Run (Proxy Service)** | | | -| - CPU allocation | 1 vCPU × 12h/day × 30 days × $0.00002400/vCPU-sec | $31.10 | -| - Memory allocation | 2GB × 12h/day × 30 days × $0.00000250/GB-sec | $6.48 | -| - Requests | 1M requests × $0.40/1M requests | $0.40 | -| **Google-Managed SSL Certificate** | Wildcard certificate (regional) | $0.00 | -| **Private Cloud DNS** | | | -| - Private hosted zone | 1 zone × $0.20/month | $0.20 | -| - DNS queries | 1M queries × $0.40/1M queries | 
$0.40 | -| **Internal Traffic** | | | -| - VPC internal traffic | 1TB × $0.01/GB | $10.00 | -| **Network Endpoint Group** | Regional NEG | $0.00 | -| | **Total Monthly Cost** | **$74.58** | - -### Cost Comparison Summary - -| Option | Monthly Cost | Annual Cost | Key Cost Drivers | -|--------|--------------|-------------|------------------| -| **External Load Balancer** | $184.58 | $2,215.00 | Internet egress ($120), LB processing ($26) | -| **Internal Load Balancer** | $74.58 | $895.00 | Internal traffic ($10), LB processing ($26) | -| **Cost Difference** | $110.00 | $1,320.00 | 60% savings with internal option | - -### Cost Optimization Opportunities - -#### For External Load Balancer: -1. **CDN Integration**: Use Cloud CDN to reduce egress costs - - Potential savings: $60-80/month on static content -2. **Committed Use Discounts**: 1-year commitment for predictable workloads - - Potential savings: 20-25% on compute costs -3. **Regional External Load Balancer**: If global access not needed - - Potential savings: $8-10/month on processing - -#### For Internal Load Balancer: -1. **VPC Peering**: Optimize network architecture - - Potential savings: $2-3/month on internal traffic -2. **Resource Right-sizing**: Adjust Cloud Run CPU/memory based on actual usage - - Potential savings: $10-15/month if over-provisioned - -### Scaling Cost Projections - -#### Data Transfer Scaling (Internal LB): -| Monthly Data | Processing Cost | Internal Traffic | Total Additional | -|--------------|-----------------|------------------|------------------| -| 2TB | $16.00 | $20.00 | $36.00 | -| 5TB | $40.00 | $50.00 | $90.00 | -| 10TB | $80.00 | $100.00 | $180.00 | - -#### Usage Scaling (both options): -| Daily Hours | CPU Cost | Memory Cost | Total Compute | -|-------------|----------|-------------|---------------| -| 6h (25%) | $15.55 | $3.24 | $18.79 | -| 12h (50%) | $31.10 | $6.48 | $37.58 | -| 24h (100%) | $62.20 | $12.96 | $75.16 | - -### Key Cost Insights - -1. 
**Internal is 60% cheaper**: Primarily due to lower network egress costs -2. **Network costs dominate**: 65% of external LB costs are network-related -3. **Compute costs are consistent**: Same Cloud Run costs for both options -4. **Load balancer overhead**: ~$26/month baseline for either option -5. **Break-even point**: Internal option pays for itself after 1 month - -### Cost Monitoring Recommendations - -1. **Set up billing alerts** at $50, $100, and $150 monthly thresholds -2. **Monitor network egress** usage patterns for optimization opportunities -3. **Track Cloud Run metrics** to right-size CPU and memory allocation -4. **Review DNS query patterns** to optimize caching strategies -5. **Consider regional vs global** load balancer based on user geography - -*Note: Prices are estimates based on Google Cloud pricing and may vary based on actual usage patterns, regional availability, and pricing changes.* diff --git a/docs/release-notifications.md b/docs/release-notifications.md deleted file mode 100644 index 0f3ee03..0000000 --- a/docs/release-notifications.md +++ /dev/null @@ -1,144 +0,0 @@ -# GCP Runner Release Notifications - -Ona publishes Pub/Sub messages when new GCP runner releases are available. Subscribe to receive push notifications instead of polling for updates. - -## Topic - -| Property | Value | -|---|---| -| Project | `gitpod-next-production` | -| Topic | `gcp-runner-releases` | -| Full name | `projects/gitpod-next-production/topics/gcp-runner-releases` | -| Message retention | 7 days | -| Access | Any authenticated GCP user can subscribe | - -## Events - -Notifications are published only for **stable** releases. 
- -| Event Type | Description | -|---|---| -| `release.stable` | Release promoted to stable | - -## Message Format - -### Attributes - -Every message includes these attributes for filtering: - -- `event_type` — `release.stable` -- `version` — Release version string (e.g., `20250115.0`) -- `source` — What triggered the notification: - - `ci_stable_promotion` — Release promoted to stable - - `gcs_notification` — Stable manifest updated in GCS (automatic) - -### Payload - -CI-published messages contain the release manifest plus Terraform module change context: - -```json -{ - "version": "20250115.0", - "commit": "abc123def", - "release_date": "2025-01-15T00:30:00Z", - "infrastructure_version": "latest", - "proxy_image": "us-docker.pkg.dev/gitpod-next-production/gitpod-next/gitpod-proxy:20250115.0", - "runner_image": "us-docker.pkg.dev/gitpod-next-production/gitpod-next/gitpod-gcp-runner:20250115.0", - "prometheus_image": "us-docker.pkg.dev/gitpod-next-production/gitpod-next/prometheus:v3.5.0", - "supervisor_url": "https://storage.googleapis.com/gitpod-runner-releases/gcp/releases/20250115.0/supervisor-amd64.xz", - "supervisor_version": "20250115.0", - "cli_url": "https://storage.googleapis.com/gitpod-runner-releases/gcp/releases/20250115.0/gitpod-linux-amd64", - "download_url": "https://storage.googleapis.com/gitpod-runner-releases/gcp/releases/20250115.0/gitpod-gcp-manifest.json", - "vm_image": "projects/gitpod-next-production/global/images/ona-environment-20250115-1041", - "terraform_changes": [ - "- Add network firewall rule for proxy health checks (a1b2c3d)", - "- Update default machine type to n2d-standard-16 (e4f5g6h)" - ], - "iam_changes_detected": false -} -``` - -| Field | Type | Description | -|---|---|---| -| `terraform_changes` | `string[]` | Terraform module commits since the previous release, excluding automated image updates. Empty array if no changes. | -| `iam_changes_detected` | `boolean` | `true` if IAM-related files (`iam.tf`, `docs/iam.md`, etc.) 
changed in this release. Signals that IAM configuration may need updating. | - -GCS notifications use the standard [Cloud Storage notification format](https://cloud.google.com/storage/docs/pubsub-notifications#payload). - -## Subscribing - -### Using gcloud - -Create a pull subscription in your project: - -```bash -gcloud pubsub subscriptions create ona-runner-releases \ - --project=YOUR_PROJECT_ID \ - --topic=projects/gitpod-next-production/topics/gcp-runner-releases \ - --ack-deadline=60 -``` - -Pull messages: - -```bash -gcloud pubsub subscriptions pull ona-runner-releases \ - --project=YOUR_PROJECT_ID \ - --auto-ack \ - --limit=10 -``` - -### Using Terraform - -```hcl -resource "google_pubsub_subscription" "ona_runner_releases" { - name = "ona-runner-releases" - project = var.project_id - - topic = "projects/gitpod-next-production/topics/gcp-runner-releases" - - ack_deadline_seconds = 60 - - # Optional: receive only CI-published messages (skip GCS notification duplicates) - # filter = "attributes.source = \"ci_stable_promotion\"" - - # Optional: dead-letter policy - # dead_letter_policy { - # dead_letter_topic = google_pubsub_topic.dead_letter.id - # max_delivery_attempts = 5 - # } - - # Optional: retry policy - retry_policy { - minimum_backoff = "10s" - maximum_backoff = "600s" - } -} -``` - -### Filtering - -Use [Pub/Sub subscription filters](https://cloud.google.com/pubsub/docs/subscription-message-filter) to receive only the events you care about: - -| Filter | Effect | -|---|---| -| `attributes.source = "ci_stable_promotion"` | CI-published stable promotions only (full manifest payload, no GCS duplicate) | -| `attributes.source = "gcs_notification"` | GCS-triggered notifications only | - -### Push Subscription - -To receive notifications via HTTP webhook: - -```hcl -resource "google_pubsub_subscription" "ona_runner_releases_push" { - name = "ona-runner-releases-push" - project = var.project_id - - topic = 
"projects/gitpod-next-production/topics/gcp-runner-releases" - - push_config { - push_endpoint = "https://your-endpoint.example.com/ona-releases" - } - - filter = "attributes.source = \"ci_stable_promotion\"" -} -``` diff --git a/examples/runner-with-networking/main.tf b/examples/runner-with-networking/main.tf index ead4c74..665eb65 100644 --- a/examples/runner-with-networking/main.tf +++ b/examples/runner-with-networking/main.tf @@ -172,6 +172,10 @@ module "runner" { create_cmek = var.create_cmek kms_key_name = var.kms_key_name + custom_images = var.custom_images + enable_agents = var.enable_agents + labels = var.labels + depends_on = [module.networking, module.dns, module.self_signed_cert, module.certbot] } diff --git a/examples/runner-with-networking/variables.tf b/examples/runner-with-networking/variables.tf index b4526b6..7b81787 100644 --- a/examples/runner-with-networking/variables.tf +++ b/examples/runner-with-networking/variables.tf @@ -158,6 +158,7 @@ variable "create_cmek" { default = false } + variable "kms_key_name" { description = "The KMS key name for CMEK encryption of GCP resources. Only used when create_cmek = false. Ignored when create_cmek = true." type = string @@ -168,3 +169,35 @@ variable "kms_key_name" { error_message = "The kms_key_name must be in the format projects/{project}/locations/{location}/keyRings/{keyring}/cryptoKeys/{key}." } } + +variable "custom_images" { + description = "Custom Docker images to use instead of default ones. Optionally includes Docker config.json content for registry credentials and insecure registry flag." 
+ type = object({ + runner_image = optional(string, "") + proxy_image = optional(string, "") + prometheus_image = optional(string, "") + node_exporter_image = optional(string, "") + docker_config_json = optional(string, "") # Docker config.json content (JSON string) + insecure = optional(bool, false) # Mark custom image registries as insecure + }) + default = { + runner_image = "" + proxy_image = "" + prometheus_image = "" + node_exporter_image = "" + docker_config_json = "" + insecure = false + } + + validation { + condition = var.custom_images.docker_config_json == "" || can(jsondecode(var.custom_images.docker_config_json)) + error_message = "docker_config_json must be empty or valid JSON string." + } +} + +variable "enable_agents" { + description = "Enable LLM agents execution feature in your Ona environments" + type = bool + default = true +} + diff --git a/files/proxy-cloud-init.tftpl b/files/proxy-cloud-init.tftpl index 3173704..b43f714 100644 --- a/files/proxy-cloud-init.tftpl +++ b/files/proxy-cloud-init.tftpl @@ -277,6 +277,8 @@ write_files: ExecStart=/usr/bin/docker%{ if DOCKER_CONFIG_ENABLED } --config /var/lib/gitpod/docker-config%{ endif } run --rm --name prometheus \ --network host \ --hostname %H \ + --memory=512m \ + --cpus=0.25 \ --volume /var/lib/prometheus:/etc/prometheus:ro \ --volume /var/lib/prometheus/data:/prometheus \ %{ if CA_ENABLED ~} @@ -336,6 +338,8 @@ write_files: ExecStart=/usr/bin/docker%{ if DOCKER_CONFIG_ENABLED } --config /var/lib/gitpod/docker-config%{ endif } run --rm --name node-exporter \ --network host \ --pid host \ + --memory=256m \ + --cpus=0.25 \ --volume /:/host:ro,rslave \ %{ if CA_ENABLED ~} --volume /var/lib/gitpod/certs/gitpod-custom-ca.crt:/etc/ssl/certs/gitpod-trust-bundle.crt:ro \ @@ -428,14 +432,29 @@ write_files: REMOTE_URL=$(echo "$SECRET_DATA" | jq -r '.url // ""') REMOTE_USER=$(echo "$SECRET_DATA" | jq -r '.user // ""') REMOTE_PASSWORD=$(echo "$SECRET_DATA" | jq -r '.password // ""') + 
MANAGED_ENDPOINT_URL=$(echo "$SECRET_DATA" | jq -r '.managedEndpointUrl // ""') + MANAGED_BEARER_TOKEN=$(echo "$SECRET_DATA" | jq -r '.managedBearerToken // ""') + RUNNER_ID=$(echo "$SECRET_DATA" | jq -r '.runnerId // ""') + ORGANIZATION_ID=$(echo "$SECRET_DATA" | jq -r '.organizationId // ""') + # Read allowlist as a bash array of prefixes + mapfile -t ALLOWLIST_PREFIXES < <(echo "$SECRET_DATA" | jq -r '.allowlistPrefixes // [] | .[]') - log "Metrics enabled: $ENABLE_METRICS, URL: $REMOTE_URL" + log "Metrics enabled: $ENABLE_METRICS, URL: $REMOTE_URL, managed endpoint: $MANAGED_ENDPOINT_URL, runner_id: $RUNNER_ID, organization_id: $ORGANIZATION_ID, allowlist prefixes: $${#ALLOWLIST_PREFIXES[@]}" # Generate final configuration using template substitution sed -e "s/{{INSTANCE_NAME}}/$INSTANCE_NAME/g" \ /var/lib/prometheus/prometheus-template.yml > /tmp/prometheus.yml.new + # Add runner_id and organization_id to external_labels if available + if [ -n "$RUNNER_ID" ]; then + sed -i "/external_labels:/a\\ runner_id: $RUNNER_ID" /tmp/prometheus.yml.new + fi + if [ -n "$ORGANIZATION_ID" ]; then + sed -i "/external_labels:/a\\ organization_id: $ORGANIZATION_ID" /tmp/prometheus.yml.new + fi + # Add remote write configuration if metrics are enabled + HAS_REMOTE_WRITE=false if [ "$ENABLE_METRICS" = "true" ] && [ -n "$REMOTE_URL" ]; then log "Adding remote write configuration" @@ -445,6 +464,7 @@ write_files: # Add remote write configuration using echo echo "" >> /tmp/prometheus.yml.new echo "remote_write:" >> /tmp/prometheus.yml.new + HAS_REMOTE_WRITE=true echo " - url: $REMOTE_URL" >> /tmp/prometheus.yml.new # Add basic auth if credentials are provided @@ -458,6 +478,49 @@ write_files: sed -i '/{{REMOTE_WRITE_CONFIG}}/d' /tmp/prometheus.yml.new fi + # Build allowlist regex once — reused by both managed and local targets. 
+ ALLOWLIST_REGEX="" + if [ $${#ALLOWLIST_PREFIXES[@]} -gt 0 ]; then + ALLOWLIST_REGEX="(" + for i in "$${!ALLOWLIST_PREFIXES[@]}"; do + if [ "$i" -gt 0 ]; then ALLOWLIST_REGEX+="|"; fi + ALLOWLIST_REGEX+="$${ALLOWLIST_PREFIXES[$i]}.*" + done + ALLOWLIST_REGEX+=")" + fi + + # Helper: append write_relabel_configs for the allowlist. + append_allowlist_relabel() { + if [ -n "$ALLOWLIST_REGEX" ]; then + echo " write_relabel_configs:" >> /tmp/prometheus.yml.new + echo " - source_labels: [__name__]" >> /tmp/prometheus.yml.new + echo " regex: '$ALLOWLIST_REGEX'" >> /tmp/prometheus.yml.new + echo " action: keep" >> /tmp/prometheus.yml.new + fi + } + + # Add managed endpoint direct push (preferred over local receiver). + # Uses a scoped JWT to push metrics directly to the management plane. + if [ -n "$MANAGED_ENDPOINT_URL" ] && [ -n "$MANAGED_BEARER_TOKEN" ]; then + log "Adding managed endpoint remote write target: $MANAGED_ENDPOINT_URL" + if [ "$HAS_REMOTE_WRITE" = "false" ]; then + echo "" >> /tmp/prometheus.yml.new + echo "remote_write:" >> /tmp/prometheus.yml.new + HAS_REMOTE_WRITE=true + fi + echo " - url: $MANAGED_ENDPOINT_URL" >> /tmp/prometheus.yml.new + echo " authorization:" >> /tmp/prometheus.yml.new + echo " type: Bearer" >> /tmp/prometheus.yml.new + echo " credentials: $MANAGED_BEARER_TOKEN" >> /tmp/prometheus.yml.new + append_allowlist_relabel + + # Audit: send the same filtered payload to the local audit receiver + # which persists each write to GCS for customer audit trails. 
+ log "Adding metrics audit receiver remote write target" + echo " - url: http://127.0.0.1:9095/write" >> /tmp/prometheus.yml.new + append_allowlist_relabel + fi + # Check if configuration changed if [ -f /var/lib/prometheus/prometheus.yml ] && cmp -s /tmp/prometheus.yml.new /var/lib/prometheus/prometheus.yml; then log "Configuration unchanged, skipping update" @@ -516,6 +579,92 @@ write_files: [Install] WantedBy=timers.target + # Metrics audit receiver — accepts Prometheus remote_write POSTs and + # writes each payload to GCS so customers can audit exactly what data + # leaves their network. Listens on 127.0.0.1:9095. + - path: /var/lib/gitpod/metrics-audit-receiver.py + permissions: '0755' + content: | + #!/usr/bin/env python3 + """Receives Prometheus remote_write payloads and writes them to GCS.""" + import os + import subprocess + import sys + import time + from http.server import HTTPServer, BaseHTTPRequestHandler + + LISTEN_ADDR = "127.0.0.1" + LISTEN_PORT = 9095 + BUCKET = os.environ.get("RUNNER_ASSETS_BUCKET_NAME", "") + RUNNER_ID = os.environ.get("RUNNER_ID", "") + + class AuditHandler(BaseHTTPRequestHandler): + def do_POST(self): + length = int(self.headers.get("Content-Length", 0)) + if length == 0: + self.send_response(204) + self.end_headers() + return + + body = self.rfile.read(length) + now = time.gmtime() + key = "metrics/runner/{rid}/{y}/{m:02d}/{d:02d}/{H:02d}{M:02d}{S:02d}.pb.snappy".format( + rid=RUNNER_ID, + y=now.tm_year, m=now.tm_mon, d=now.tm_mday, + H=now.tm_hour, M=now.tm_min, S=now.tm_sec, + ) + dst = "gs://{}/{}".format(BUCKET, key) + + try: + proc = subprocess.run( + ["gcloud", "storage", "cp", "-", dst], + input=body, capture_output=True, timeout=30, + ) + if proc.returncode != 0: + sys.stderr.write("gcloud cp failed: {}\n".format(proc.stderr.decode())) + self.send_response(502) + self.end_headers() + return + except Exception as e: + sys.stderr.write("audit write error: {}\n".format(e)) + self.send_response(502) + self.end_headers() + 
return + + self.send_response(204) + self.end_headers() + + def log_message(self, fmt, *args): + pass # suppress per-request access logs + + if not BUCKET or not RUNNER_ID: + sys.stderr.write("RUNNER_ASSETS_BUCKET_NAME or RUNNER_ID not set, exiting\n") + sys.exit(1) + + server = HTTPServer((LISTEN_ADDR, LISTEN_PORT), AuditHandler) + sys.stderr.write("metrics-audit-receiver listening on {}:{}\n".format(LISTEN_ADDR, LISTEN_PORT)) + server.serve_forever() + + # Systemd service for the metrics audit receiver + - path: /var/lib/systemd/system/metrics-audit-receiver.service + permissions: '0644' + content: | + [Unit] + Description=Metrics Audit Receiver + After=network.target + Before=prometheus.service + + [Service] + Type=simple + Restart=always + RestartSec=5s + Environment=RUNNER_ASSETS_BUCKET_NAME=${RUNNER_ASSETS_BUCKET_NAME} + Environment=RUNNER_ID=${RUNNER_ID} + ExecStart=/var/lib/gitpod/metrics-audit-receiver.py + + [Install] + WantedBy=multi-user.target + %{ if CERTIFICATE_SECRET_ID != "" ~} # Certificate refresh script - polls Secret Manager and updates cert files on disk - path: /var/lib/gitpod/scripts/cert-refresh.sh @@ -655,6 +804,8 @@ write_files: ExecStartPre=-/usr/bin/docker rm gitpod-proxy ExecStart=/usr/bin/docker%{ if DOCKER_CONFIG_ENABLED } --config /var/lib/gitpod/docker-config%{ endif } run --rm --name gitpod-proxy \ --network host \ + --memory=${PROXY_CONTAINER_MEMORY} \ + --cpus=${PROXY_CONTAINER_CPUS} \ --volume /var/lib/gitpod/tls:/tmp/certs:rw \ %{ if CA_ENABLED ~} --volume /var/lib/gitpod/certs/gitpod-custom-ca.crt:/etc/ssl/certs/gitpod-trust-bundle.crt:ro \ @@ -865,6 +1016,7 @@ write_files: ln -sf /var/lib/systemd/system/node-exporter.service /etc/systemd/system/node-exporter.service ln -sf /var/lib/systemd/system/prometheus-config-updater.service /etc/systemd/system/prometheus-config-updater.service ln -sf /var/lib/systemd/system/prometheus-config-updater.timer /etc/systemd/system/prometheus-config-updater.timer + ln -sf 
/var/lib/systemd/system/metrics-audit-receiver.service /etc/systemd/system/metrics-audit-receiver.service %{ if CERTIFICATE_SECRET_ID != "" ~} ln -sf /var/lib/systemd/system/cert-refresh.service /etc/systemd/system/cert-refresh.service ln -sf /var/lib/systemd/system/cert-refresh.timer /etc/systemd/system/cert-refresh.timer @@ -896,6 +1048,7 @@ write_files: systemctl enable prometheus systemctl enable node-exporter systemctl enable prometheus-config-updater.timer + systemctl enable metrics-audit-receiver.service %{ if CERTIFICATE_SECRET_ID != "" ~} systemctl enable cert-refresh.timer %{ endif ~} @@ -913,6 +1066,9 @@ write_files: echo "Starting Prometheus Config Updater Timer..." systemctl start prometheus-config-updater.timer + echo "Starting Metrics Audit Receiver..." + systemctl start metrics-audit-receiver.service + %{ if CERTIFICATE_SECRET_ID != "" ~} echo "Starting Certificate Refresh Timer..." systemctl start cert-refresh.timer diff --git a/files/runner-cloud-init.tftpl b/files/runner-cloud-init.tftpl index f2e4304..2ede3a6 100644 --- a/files/runner-cloud-init.tftpl +++ b/files/runner-cloud-init.tftpl @@ -227,6 +227,7 @@ write_files: scrape_timeout: 10s evaluation_interval: 15s external_labels: + stack: gcp project_id: ${PROJECT_ID} region: ${REGION} runner_name: ${RUNNER_ID} @@ -295,6 +296,8 @@ write_files: ExecStart=/usr/bin/docker%{ if DOCKER_CONFIG_ENABLED } --config /var/lib/gitpod/docker-config%{ endif } run --rm --name prometheus \ --network host \ --hostname %H \ + --memory=1g \ + --cpus=0.5 \ --volume /var/lib/prometheus:/etc/prometheus:ro \ --volume /var/lib/prometheus/data:/prometheus \ --volume /var/lib/gitpod/certs/gitpod-custom-ca.crt:/etc/ssl/certs/gitpod-trust-bundle.crt:ro \ @@ -343,6 +346,8 @@ write_files: ExecStart=/usr/bin/docker%{ if DOCKER_CONFIG_ENABLED } --config /var/lib/gitpod/docker-config%{ endif } run --rm --name node-exporter \ --network host \ --pid host \ + --memory=256m \ + --cpus=0.25 \ --volume /:/host:ro,rslave \ --volume 
/var/lib/gitpod/certs/gitpod-custom-ca.crt:/etc/ssl/certs/gitpod-trust-bundle.crt:ro \ --env http_proxy=${HTTP_PROXY} \ @@ -421,34 +426,101 @@ write_files: REMOTE_URL=$(echo "$SECRET_DATA" | jq -r '.url // ""') REMOTE_USER=$(echo "$SECRET_DATA" | jq -r '.user // ""') REMOTE_PASSWORD=$(echo "$SECRET_DATA" | jq -r '.password // ""') + LOCAL_REMOTE_WRITE_URL=$(echo "$SECRET_DATA" | jq -r '.localRemoteWriteUrl // ""') + MANAGED_ENDPOINT_URL=$(echo "$SECRET_DATA" | jq -r '.managedEndpointUrl // ""') + MANAGED_BEARER_TOKEN=$(echo "$SECRET_DATA" | jq -r '.managedBearerToken // ""') + RUNNER_ID=$(echo "$SECRET_DATA" | jq -r '.runnerId // ""') + ORGANIZATION_ID=$(echo "$SECRET_DATA" | jq -r '.organizationId // ""') + # Read allowlist as a bash array of prefixes + mapfile -t ALLOWLIST_PREFIXES < <(echo "$SECRET_DATA" | jq -r '.allowlistPrefixes // [] | .[]') - log "Metrics enabled: $ENABLE_METRICS, URL: $REMOTE_URL" + log "Metrics enabled: $ENABLE_METRICS, URL: $REMOTE_URL, managed endpoint: $MANAGED_ENDPOINT_URL, local remote write: $LOCAL_REMOTE_WRITE_URL, runner_id: $RUNNER_ID, organization_id: $ORGANIZATION_ID, allowlist prefixes: $${#ALLOWLIST_PREFIXES[@]}" # Generate final configuration using template substitution sed -e "s/{{INSTANCE_NAME}}/$INSTANCE_NAME/g" \ /var/lib/prometheus/prometheus-template.yml > /tmp/prometheus.yml.new - # Add remote write configuration if metrics are enabled + # Add runner_id and organization_id to external_labels if available + if [ -n "$RUNNER_ID" ]; then + sed -i "/external_labels:/a\\ runner_id: $RUNNER_ID" /tmp/prometheus.yml.new + fi + if [ -n "$ORGANIZATION_ID" ]; then + sed -i "/external_labels:/a\\ organization_id: $ORGANIZATION_ID" /tmp/prometheus.yml.new + fi + + # Build remote_write configuration from available targets + HAS_REMOTE_WRITE=false + + # Remove the placeholder line first + sed -i '/{{REMOTE_WRITE_CONFIG}}/d' /tmp/prometheus.yml.new + + # Add Grafana Cloud remote write if metrics are enabled if [ "$ENABLE_METRICS" = 
"true" ] && [ -n "$REMOTE_URL" ]; then - log "Adding remote write configuration" - - # Remove the placeholder line - sed -i '/{{REMOTE_WRITE_CONFIG}}/d' /tmp/prometheus.yml.new - - # Add remote write configuration using echo - echo "" >> /tmp/prometheus.yml.new - echo "remote_write:" >> /tmp/prometheus.yml.new + log "Adding Grafana Cloud remote write target" + if [ "$HAS_REMOTE_WRITE" = "false" ]; then + echo "" >> /tmp/prometheus.yml.new + echo "remote_write:" >> /tmp/prometheus.yml.new + HAS_REMOTE_WRITE=true + fi echo " - url: $REMOTE_URL" >> /tmp/prometheus.yml.new - - # Add basic auth if credentials are provided if [ -n "$REMOTE_USER" ] && [ -n "$REMOTE_PASSWORD" ]; then echo " basic_auth:" >> /tmp/prometheus.yml.new echo " username: $REMOTE_USER" >> /tmp/prometheus.yml.new echo " password: $REMOTE_PASSWORD" >> /tmp/prometheus.yml.new fi - else - # Remove the placeholder line when metrics are disabled - sed -i '/{{REMOTE_WRITE_CONFIG}}/d' /tmp/prometheus.yml.new + fi + + # Build allowlist regex once — reused by both managed and local targets. + ALLOWLIST_REGEX="" + if [ $${#ALLOWLIST_PREFIXES[@]} -gt 0 ]; then + ALLOWLIST_REGEX="(" + for i in "$${!ALLOWLIST_PREFIXES[@]}"; do + if [ "$i" -gt 0 ]; then ALLOWLIST_REGEX+="|"; fi + ALLOWLIST_REGEX+="$${ALLOWLIST_PREFIXES[$i]}.*" + done + ALLOWLIST_REGEX+=")" + fi + + # Helper: append write_relabel_configs for the allowlist. + append_allowlist_relabel() { + if [ -n "$ALLOWLIST_REGEX" ]; then + echo " write_relabel_configs:" >> /tmp/prometheus.yml.new + echo " - source_labels: [__name__]" >> /tmp/prometheus.yml.new + echo " regex: '$ALLOWLIST_REGEX'" >> /tmp/prometheus.yml.new + echo " action: keep" >> /tmp/prometheus.yml.new + fi + } + + # Add managed endpoint direct push (preferred over local receiver). + # Uses a scoped JWT to push metrics directly to the management plane. 
+ if [ -n "$MANAGED_ENDPOINT_URL" ] && [ -n "$MANAGED_BEARER_TOKEN" ]; then + log "Adding managed endpoint remote write target: $MANAGED_ENDPOINT_URL" + if [ "$HAS_REMOTE_WRITE" = "false" ]; then + echo "" >> /tmp/prometheus.yml.new + echo "remote_write:" >> /tmp/prometheus.yml.new + HAS_REMOTE_WRITE=true + fi + echo " - url: $MANAGED_ENDPOINT_URL" >> /tmp/prometheus.yml.new + echo " authorization:" >> /tmp/prometheus.yml.new + echo " type: Bearer" >> /tmp/prometheus.yml.new + echo " credentials: $MANAGED_BEARER_TOKEN" >> /tmp/prometheus.yml.new + append_allowlist_relabel + + # Audit: send the same filtered payload to the local audit receiver + # which persists each write to GCS for customer audit trails. + log "Adding metrics audit receiver remote write target" + echo " - url: http://127.0.0.1:9095/write" >> /tmp/prometheus.yml.new + append_allowlist_relabel + elif [ -n "$LOCAL_REMOTE_WRITE_URL" ]; then + # Fallback: local remote write target for managed metrics pipeline. + log "Adding local remote write target: $LOCAL_REMOTE_WRITE_URL" + if [ "$HAS_REMOTE_WRITE" = "false" ]; then + echo "" >> /tmp/prometheus.yml.new + echo "remote_write:" >> /tmp/prometheus.yml.new + HAS_REMOTE_WRITE=true + fi + echo " - url: $LOCAL_REMOTE_WRITE_URL" >> /tmp/prometheus.yml.new + append_allowlist_relabel fi # Check if configuration changed @@ -534,6 +606,8 @@ write_files: INSTANCE_GROUP_NAME=${INSTANCE_GROUP_NAME} BUILD_CACHE_BUCKET=${BUILD_CACHE_BUCKET} GITPOD_DEVELOPMENT_VERSION=${DEVELOPMENT_VERSION} + MANAGED_METRICS_DIRECT_PUSH=true + RUNNER_ASSETS_BUCKET_NAME=${RUNNER_ASSETS_BUCKET_NAME} PUBSUB_SUBSCRIPTION_ID=${PUBSUB_SUBSCRIPTION_ID} AUTH_PROXY_URL=${AUTH_PROXY_URL} RUNNER_LOGS_URL="${RUNNER_LOGS_URL}" @@ -541,10 +615,6 @@ write_files: https_proxy=${HTTPS_PROXY} all_proxy=${ALL_PROXY} no_proxy=${NO_PROXY} -%{ if HONEYCOMB_API_KEY != "" ~} - HONEYCOMB_API_KEY=${HONEYCOMB_API_KEY} -%{ endif ~} - %{ if AUTH_PROXY_TLS_CERT != "" ~} # Auth proxy TLS certificate for verification 
(CA trust) - path: /var/lib/gitpod/auth-proxy-ca.crt @@ -586,6 +656,8 @@ write_files: ExecStart=/usr/bin/docker%{ if DOCKER_CONFIG_ENABLED } --config /var/lib/gitpod/docker-config%{ endif } run --rm --name gitpod-auth-proxy \ --network host \ --hostname %H \ + --memory=512m \ + --cpus=0.5 \ --volume /var/lib/gitpod/certs:/var/lib/gitpod/certs:ro \ %{ if HAS_TRUST_BUNDLE ~} --volume /var/lib/gitpod/certs/gitpod-custom-ca.crt:/etc/ssl/certs/gitpod-trust-bundle.crt:ro \ @@ -634,12 +706,16 @@ write_files: ExecStart=/usr/bin/docker%{ if DOCKER_CONFIG_ENABLED } --config /var/lib/gitpod/docker-config%{ endif } run --rm --name gitpod-runner \ --network host \ --hostname %H \ + --memory=${RUNNER_CONTAINER_MEMORY} \ + --cpus=${RUNNER_CONTAINER_CPUS} \ --volume /var/lib/prometheus:/var/lib/prometheus \ --volume /tmp:/tmp \ --env http_proxy=${HTTP_PROXY} \ --env https_proxy=${HTTPS_PROXY} \ --env all_proxy=${ALL_PROXY} \ --env GITPOD_DEVELOPMENT_VERSION=${DEVELOPMENT_VERSION} \ + --env MANAGED_METRICS_DIRECT_PUSH=true \ + --env GITPOD_TERRAFORM_MODULE_VERSION=${TERRAFORM_MODULE_VERSION} \ --env no_proxy=${NO_PROXY} \ %{ if HAS_TRUST_BUNDLE ~} --volume /var/lib/gitpod/certs/gitpod-custom-ca.crt:/etc/ssl/certs/gitpod-trust-bundle.crt:ro \ @@ -683,7 +759,9 @@ write_files: --prometheus-secret-name=${METRICS_SECRET_ID} \ --agent-execution-feature-enabled=${ENABLE_AGENTS} \ --agent-bucket-name=${AGENT_BUCKET_NAME} \ - --mig-warm-pool-enabled=${MIG_WARM_POOL_ENABLED} + --runner-assets-bucket-name=${RUNNER_ASSETS_BUCKET_NAME} \ + --mig-warm-pool-enabled=${MIG_WARM_POOL_ENABLED} \ + --port-auth-enabled=true ExecStopPre=-/bin/bash -c 'timeout 10 /var/lib/gitpod/flush-metrics.sh || true' ExecStop=/usr/bin/docker stop gitpod-runner @@ -816,6 +894,91 @@ write_files: # Execute main function main "$@" + # Metrics audit receiver — accepts Prometheus remote_write POSTs and + # writes each payload to GCS so customers can audit exactly what data + # leaves their network. 
Listens on 127.0.0.1:9095. + - path: /var/lib/gitpod/metrics-audit-receiver.py + permissions: '0755' + content: | + #!/usr/bin/env python3 + """Receives Prometheus remote_write payloads and writes them to GCS.""" + import os + import subprocess + import sys + import time + from http.server import HTTPServer, BaseHTTPRequestHandler + + LISTEN_ADDR = "127.0.0.1" + LISTEN_PORT = 9095 + BUCKET = os.environ.get("RUNNER_ASSETS_BUCKET_NAME", "") + RUNNER_ID = os.environ.get("RUNNER_ID", "") + + class AuditHandler(BaseHTTPRequestHandler): + def do_POST(self): + length = int(self.headers.get("Content-Length", 0)) + if length == 0: + self.send_response(204) + self.end_headers() + return + + body = self.rfile.read(length) + now = time.gmtime() + key = "metrics/runner/{rid}/{y}/{m:02d}/{d:02d}/{H:02d}{M:02d}{S:02d}.pb.snappy".format( + rid=RUNNER_ID, + y=now.tm_year, m=now.tm_mon, d=now.tm_mday, + H=now.tm_hour, M=now.tm_min, S=now.tm_sec, + ) + dst = "gs://{}/{}".format(BUCKET, key) + + try: + proc = subprocess.run( + ["gcloud", "storage", "cp", "-", dst], + input=body, capture_output=True, timeout=30, + ) + if proc.returncode != 0: + sys.stderr.write("gcloud cp failed: {}\n".format(proc.stderr.decode())) + self.send_response(502) + self.end_headers() + return + except Exception as e: + sys.stderr.write("audit write error: {}\n".format(e)) + self.send_response(502) + self.end_headers() + return + + self.send_response(204) + self.end_headers() + + def log_message(self, fmt, *args): + pass # suppress per-request access logs + + if not BUCKET or not RUNNER_ID: + sys.stderr.write("RUNNER_ASSETS_BUCKET_NAME or RUNNER_ID not set, exiting\n") + sys.exit(1) + + server = HTTPServer((LISTEN_ADDR, LISTEN_PORT), AuditHandler) + sys.stderr.write("metrics-audit-receiver listening on {}:{}\n".format(LISTEN_ADDR, LISTEN_PORT)) + server.serve_forever() + + # Systemd service for the metrics audit receiver + - path: /var/lib/systemd/system/metrics-audit-receiver.service + permissions: '0644' + 
content: | + [Unit] + Description=Metrics Audit Receiver + After=network.target + Before=prometheus.service + + [Service] + Type=simple + Restart=always + RestartSec=5s + EnvironmentFile=/var/lib/gitpod/runner.env + ExecStart=/var/lib/gitpod/metrics-audit-receiver.py + + [Install] + WantedBy=multi-user.target + # Enhanced startup script with better error handling and validation - path: /tmp/container-startup.sh permissions: '0755' @@ -954,6 +1117,7 @@ write_files: "node-exporter.service" "prometheus-config-updater.service" "prometheus-config-updater.timer" + "metrics-audit-receiver.service" "gitpod-auth-proxy.service" "gitpod-runner.service" ) @@ -1215,5 +1379,6 @@ runcmd: # Ensure port ports are open - iptables -A INPUT -p tcp --dport 9091 -j ACCEPT # Allow health check port from all sources (GCP firewall restricts to probes) - iptables -A INPUT -p tcp --dport 4430 -j ACCEPT # Auth proxy port + - iptables -A INPUT -p tcp --dport 7070 -j ACCEPT # Port auth service - iptables -A INPUT -p tcp --dport 8080 -j ACCEPT # If needed for your runner service - iptables -A INPUT -p tcp --dport 22 -j ACCEPT # SSH is already allowed, but harmless to add diff --git a/firewall.tf b/firewall.tf index 05aadcc..4152274 100644 --- a/firewall.tf +++ b/firewall.tf @@ -61,7 +61,7 @@ resource "google_compute_firewall" "allow_proxy_to_runner_backend" { allow { protocol = "tcp" - ports = [tostring(var.service_ports.runner_http_port), "4430"] + ports = [tostring(var.service_ports.runner_http_port), "4430", "7070"] } source_tags = ["gitpod-proxy"] @@ -86,7 +86,8 @@ resource "google_compute_firewall" "deny_environments_to_services" { tostring(var.service_ports.runner_health_port), tostring(var.service_ports.proxy_https_port), tostring(var.service_ports.proxy_http_port), - "4430" + "4430", + "7070" ] } @@ -98,6 +99,10 @@ resource "google_compute_firewall" "deny_environments_to_services" { target_tags = ["gitpod-runner", "gitpod-proxy"] priority = 1000 # Higher priority than allow rules + 
log_config { + metadata = "INCLUDE_ALL_METADATA" + } + # depends on proxy vm depends_on = [google_compute_backend_service.proxy] } @@ -118,6 +123,10 @@ resource "google_compute_firewall" "allow_iap_to_environments" { source_ranges = ["35.235.240.0/20"] target_tags = ["gitpod-type-environment"] + log_config { + metadata = "INCLUDE_ALL_METADATA" + } + # depends on proxy vm depends_on = [google_compute_backend_service.proxy] } @@ -163,6 +172,10 @@ resource "google_compute_firewall" "deny_email_from_environments" { destination_ranges = ["0.0.0.0/0"] target_tags = ["gitpod-type-environment"] + log_config { + metadata = "INCLUDE_ALL_METADATA" + } + # depends on proxy vm depends_on = [google_compute_backend_service.proxy] } @@ -194,23 +207,42 @@ resource "google_compute_firewall" "runner_health_check" { target_tags = ["allow-health-check", "lb-health-check"] } -# Firewall rule to allow internal traffic +# Firewall rule to allow service traffic to runner instances. +# SSH (22) is intentionally excluded — runner-to-environment SSH uses +# tag-based rules, and operator SSH should go through IAP (35.235.240.0/20). 
resource "google_compute_firewall" "runner_internal_traffic" { name = "${var.runner_name}-internal-traffic" network = var.vpc_name project = local.vpc_project_id - description = "Allow internal traffic to runner instances" + description = "Allow service traffic to runner instances (SSH excluded — use IAP)" allow { protocol = "tcp" - ports = ["22", "8080", "9091", "4430"] + ports = ["8080", "9091", "4430"] } source_ranges = ["0.0.0.0/0"] target_tags = ["gitpod-runner"] } +# Allow IAP TCP forwarding to runner instances for operator SSH access +resource "google_compute_firewall" "allow_iap_to_runner" { + name = "${var.runner_name}-allow-iap-to-runner" + network = var.vpc_name + project = local.vpc_project_id + + description = "Allow IAP TCP forwarding to runner instances for SSH" + + allow { + protocol = "tcp" + ports = ["22"] + } + + source_ranges = ["35.235.240.0/20"] + target_tags = ["gitpod-runner"] +} + # Basic firewall rules for SSH and health checks # Specific proxy and runner communication rules are defined in main.tf after proxy deployment @@ -289,23 +321,41 @@ resource "google_compute_firewall" "proxy_health_check" { target_tags = ["gitpod-proxy"] } -# Firewall rule to allow HTTP/HTTPS traffic +# Firewall rule to allow HTTP/HTTPS traffic to proxy instances. +# SSH (22) is intentionally excluded — operator SSH should go through IAP. 
resource "google_compute_firewall" "proxy_web_traffic" { name = "${var.runner_name}-proxy-web-traffic" network = var.vpc_name project = local.vpc_project_id - description = "Allow HTTP/HTTPS traffic to proxy instances" + description = "Allow HTTP/HTTPS traffic to proxy instances (SSH excluded — use IAP)" allow { protocol = "tcp" - ports = ["22", "8080", "8443"] + ports = ["8080", "8443"] } source_ranges = ["0.0.0.0/0"] target_tags = ["gitpod-proxy"] } +# Allow IAP TCP forwarding to proxy instances for operator SSH access +resource "google_compute_firewall" "allow_iap_to_proxy" { + name = "${var.runner_name}-allow-iap-to-proxy" + network = var.vpc_name + project = local.vpc_project_id + + description = "Allow IAP TCP forwarding to proxy instances for SSH" + + allow { + protocol = "tcp" + ports = ["22"] + } + + source_ranges = ["35.235.240.0/20"] + target_tags = ["gitpod-proxy"] +} + data "google_compute_subnetwork" "runner_subnet" { name = var.runner_subnet_name region = var.region @@ -392,6 +442,10 @@ resource "google_compute_firewall" "deny_proxy_to_environments_ssh_egress" { destination_ranges = [data.google_compute_subnetwork.runner_subnet.ip_cidr_range] target_tags = ["gitpod-proxy"] + + log_config { + metadata = "INCLUDE_ALL_METADATA" + } } # Allow proxy egress to environments on application ports only @@ -441,12 +495,20 @@ resource "google_compute_firewall" "allow_runner_to_environments_egress" { } # Allow environments to access outside the network (egress) +# +# UDP egress is intentionally restricted to a small set of ports rather than +# wide open. This blocks arbitrary UDP-based data exfiltration and reflection +# /amplification vectors while preserving the developer workflows that +# legitimately need UDP: +# - 53 (DNS) - name resolution for apt, git, npm, docker pull, etc. +# - 123 (NTP) - clock sync; required for TLS, JWT, git-over-HTTPS +# - 443 (QUIC) - HTTP/3 to Google, Cloudflare, GitHub, registries, ... 
resource "google_compute_firewall" "allow_environments_internet_egress" { name = "${var.runner_name}-allow-env-internet-egress" network = var.vpc_name project = local.vpc_project_id - description = "Allow environments to access outside the network" + description = "Allow environments to access outside the network (UDP restricted to DNS/NTP/QUIC)" direction = "EGRESS" priority = 1000 @@ -456,6 +518,7 @@ resource "google_compute_firewall" "allow_environments_internet_egress" { allow { protocol = "udp" + ports = ["53", "123", "443"] } allow { @@ -464,4 +527,8 @@ resource "google_compute_firewall" "allow_environments_internet_egress" { destination_ranges = ["0.0.0.0/0"] target_tags = ["gitpod-type-environment"] + + log_config { + metadata = "INCLUDE_ALL_METADATA" + } } diff --git a/iam.tf b/iam.tf index b89b012..c8e10a3 100644 --- a/iam.tf +++ b/iam.tf @@ -120,6 +120,7 @@ resource "google_project_iam_custom_role" "runner" { "compute.instances.list", "compute.instances.start", "compute.instances.stop", + "compute.instances.resume", "compute.instances.setLabels", "compute.instances.setMetadata", "compute.instances.setTags", @@ -206,11 +207,16 @@ resource "google_project_iam_custom_role" "runner" { "pubsub.topics.get", "pubsub.topics.list", - # IAM permissions for service account management - "iam.serviceAccounts.actAs", - "iam.serviceAccounts.getIamPolicy", - "iam.serviceAccounts.setIamPolicy", - "iam.serviceAccounts.getAccessToken", + # IAM permissions for service account management. + # actAs (roles/iam.serviceAccountUser) is granted per-SA via + # google_service_account_iam_member resources below — only on the + # runner, environment_vm, and proxy_vm SAs the runner needs to attach + # to instances and instance templates. 
+ # + # getIamPolicy/setIamPolicy on service accounts are granted via the + # runner_sa_iam_manager custom role, bound per-SA on the same three + # SAs — not at project level — so the runner cannot modify IAM on + # unrelated service accounts in the project. # Instance template permissions for runner control plane "compute.instanceTemplates.create", @@ -226,11 +232,21 @@ resource "google_project_iam_custom_role" "runner" { "compute.instanceGroupManagers.create", "compute.instanceGroupManagers.delete", "compute.instanceGroupManagers.update", + "compute.instanceGroupManagers.use", # Instance group permissions required for MIG operations "compute.instanceGroups.delete", "compute.instanceGroups.list", + # Autoscaler permissions for dynamic warm pool scaling + "compute.autoscalers.create", + "compute.autoscalers.delete", + "compute.autoscalers.get", + "compute.autoscalers.update", + + # Cloud Monitoring permissions for publishing warm pool scaling metrics + "monitoring.timeSeries.create", + # Cloud Logging permissions for environment and prebuild log persistence "logging.logEntries.list", # Read environment logs from Cloud Logging "logging.logEntries.create", # Write prebuild logs to Cloud Logging @@ -296,13 +312,17 @@ resource "google_secret_manager_secret_iam_member" "runner_cp_certificate_secret } # GCS access for runner assets bucket (runner VMs) +# objectAdmin is required because the runner reads trust bundles and writes +# metrics audit payloads (managed metrics) to this bucket. resource "google_storage_bucket_iam_member" "runner_runner_assets_access" { bucket = google_storage_bucket.runner_assets.name - role = "roles/storage.objectViewer" + role = "roles/storage.objectAdmin" member = "serviceAccount:${local.runner_sa_email}" } # GCS access for agent storage bucket (runner VMs) +# objectAdmin is required because the runner deletes conversation, blob, and +# result objects during agent execution cleanup (objectViewer/objectCreator lack delete; note objectUser also grants it).
resource "google_storage_bucket_iam_member" "runner_agent_storage_access" { count = var.enable_agents ? 1 : 0 @@ -332,68 +352,6 @@ resource "google_project_iam_member" "env_vm_artifact_registry" { # Logging and monitoring permissions consolidated below -# 3. BUILD CACHE SERVICE ACCOUNT -# Dedicated for GCS build cache operations -resource "google_service_account" "build_cache" { - count = var.pre_created_service_accounts.build_cache == "" ? 1 : 0 - - account_id = "${var.runner_name}-build-cache" - display_name = "Ona Build Cache" - description = "Service account for GCS build cache operations" - project = var.project_id -} - -# Logging permissions consolidated below - -# Allow runner to generate tokens for build cache service account -resource "google_service_account_iam_member" "runner_generate_build_cache_tokens" { - count = !local.using_pre_created_service_accounts && local.runner_sa_email != "" ? 1 : 0 - service_account_id = local.build_cache_sa_name - role = "roles/iam.serviceAccountTokenCreator" - member = "serviceAccount:${local.runner_sa_email}" -} - -# 4. SECRET MANAGEMENT SERVICE ACCOUNT -# For environment-specific secrets with scoped access -resource "google_service_account" "secret_manager" { - count = var.pre_created_service_accounts.secret_manager == "" ? 1 : 0 - - account_id = "${var.runner_name}-secrets" - display_name = "Ona Secret Manager" - description = "Service account for environment secret management" - project = var.project_id -} - -# Logging permissions consolidated below - -# Custom role for secret management -resource "google_project_iam_custom_role" "secret_manager" { - count = var.pre_created_service_accounts.secret_manager == "" ? 
1 : 0 - - role_id = "${replace(var.runner_name, "-", "_")}_secret_manager" - title = "Ona Secret Manager" - description = "Scoped permissions for environment secret management" - project = var.project_id - - permissions = [ - "secretmanager.secrets.create", - "secretmanager.secrets.delete", - "secretmanager.secrets.get", - "secretmanager.secrets.list", - "secretmanager.versions.access", - "secretmanager.versions.add", - "secretmanager.versions.destroy" - ] -} - -# Allow runner to generate tokens for secret manager -resource "google_service_account_iam_member" "runner_generate_secret_tokens" { - count = !local.using_pre_created_service_accounts && local.runner_sa_email != "" ? 1 : 0 - service_account_id = local.secret_manager_sa_name - role = "roles/iam.serviceAccountTokenCreator" - member = "serviceAccount:${local.runner_sa_email}" -} - # Allow runner control plane to access the Redis credentials secret resource "google_secret_manager_secret_iam_member" "runner_cp_redis_secret_access" { project = var.project_id @@ -409,26 +367,7 @@ resource "google_project_iam_member" "runner_secret_version_manager" { member = "serviceAccount:${local.runner_sa_email}" } -# 5. PUB/SUB EVENT PROCESSING SERVICE ACCOUNT -# For event-driven reconciliation -resource "google_service_account" "pubsub_processor" { - count = var.pre_created_service_accounts.pubsub_processor == "" ? 1 : 0 - account_id = "${var.runner_name}-pubsub" - display_name = "Ona Pub/Sub Processor" - description = "Service account for processing Pub/Sub compute events" - project = var.project_id -} - -# Logging and monitoring permissions consolidated below - -# Allow runner to use Pub/Sub processor for event handling -resource "google_service_account_iam_member" "runner_use_pubsub_processor" { - count = !local.using_pre_created_service_accounts && local.runner_sa_email != "" ? 
1 : 0 - service_account_id = local.pubsub_processor_sa_name - role = "roles/iam.serviceAccountUser" - member = "serviceAccount:${local.runner_sa_email}" -} # RUNNER TOKEN SECRET MANAGER RESOURCE # Create a secret for storing the runner token securely @@ -520,26 +459,7 @@ resource "google_project_iam_member" "env_vm_logging" { member = "serviceAccount:${local.environment_vm_sa_email}" } -resource "google_project_iam_member" "build_cache_logging" { - count = !local.using_pre_created_service_accounts && local.build_cache_sa_email != "" ? 1 : 0 - project = var.project_id - role = "roles/logging.logWriter" - member = "serviceAccount:${local.build_cache_sa_email}" -} - -resource "google_project_iam_member" "secret_manager_logging" { - count = !local.using_pre_created_service_accounts && local.secret_manager_sa_email != "" ? 1 : 0 - project = var.project_id - role = "roles/logging.logWriter" - member = "serviceAccount:${local.secret_manager_sa_email}" -} -resource "google_project_iam_member" "pubsub_processor_logging" { - count = !local.using_pre_created_service_accounts && local.pubsub_processor_sa_email != "" ? 1 : 0 - project = var.project_id - role = "roles/logging.logWriter" - member = "serviceAccount:${local.pubsub_processor_sa_email}" -} # Monitoring permissions - individual members for each service account resource "google_project_iam_member" "runner_cp_monitoring" { @@ -556,13 +476,6 @@ resource "google_project_iam_member" "env_vm_monitoring" { member = "serviceAccount:${local.environment_vm_sa_email}" } -resource "google_project_iam_member" "pubsub_processor_monitoring" { - count = !local.using_pre_created_service_accounts && local.pubsub_processor_sa_email != "" ? 1 : 0 - project = var.project_id - role = "roles/monitoring.metricWriter" - member = "serviceAccount:${local.pubsub_processor_sa_email}" -} - # Service account for proxy VMs resource "google_service_account" "proxy_vm" { count = var.pre_created_service_accounts.proxy_vm == "" ? 
1 : 0 @@ -660,9 +573,10 @@ resource "google_secret_manager_secret_iam_member" "proxy_vm_certificate_secret_ } # GCS access for runner assets bucket (proxy VMs) +# objectAdmin is required for the metrics audit receiver to write audit payloads. resource "google_storage_bucket_iam_member" "proxy_vm_runner_assets_access" { bucket = google_storage_bucket.runner_assets.name - role = "roles/storage.objectViewer" + role = "roles/storage.objectAdmin" member = "serviceAccount:${local.proxy_vm_sa_email}" } @@ -760,34 +674,125 @@ resource "google_kms_crypto_key_iam_member" "environment_vm_kms_access" { member = "serviceAccount:${local.environment_vm_sa_email}" } -resource "google_kms_crypto_key_iam_member" "build_cache_kms_access" { - count = (var.create_cmek || var.kms_key_name != null) && local.build_cache_sa_email != "" ? 1 : 0 +resource "google_kms_crypto_key_iam_member" "proxy_vm_kms_access" { + count = (var.create_cmek || var.kms_key_name != null) && local.proxy_vm_sa_email != "" ? 1 : 0 crypto_key_id = local.kms_key_name role = "roles/cloudkms.cryptoKeyEncrypterDecrypter" - member = "serviceAccount:${local.build_cache_sa_email}" + member = "serviceAccount:${local.proxy_vm_sa_email}" } -resource "google_kms_crypto_key_iam_member" "secret_manager_kms_access" { - count = (var.create_cmek || var.kms_key_name != null) && local.secret_manager_sa_email != "" ? 1 : 0 +# ================================ +# RUNNER actAs BINDINGS (PER-SA) +# ================================ +# Grants the runner SA roles/iam.serviceAccountUser on the three SAs it +# attaches to instances and instance templates: +# - environment_vm_sa: attached to environment VMs. +# - proxy_vm_sa: attached to proxy VM instance templates. +# - runner_sa (self): attached to runner VM instance templates. +# +# Scoped per-SA so the runner cannot impersonate unrelated service +# accounts in the project. 
+# +# When pre_created_service_accounts is set the operator is responsible +# for granting roles/iam.serviceAccountUser on the relevant SAs to the +# runner SA out of band; the module does not manage IAM on SAs it did +# not create. +resource "google_service_account_iam_member" "runner_actas_runner" { + count = !local.using_pre_created_service_accounts && local.runner_sa_email != "" ? 1 : 0 - crypto_key_id = local.kms_key_name - role = "roles/cloudkms.cryptoKeyEncrypterDecrypter" - member = "serviceAccount:${local.secret_manager_sa_email}" + service_account_id = local.runner_sa_name + role = "roles/iam.serviceAccountUser" + member = "serviceAccount:${local.runner_sa_email}" + + depends_on = [google_service_account.runner] } -resource "google_kms_crypto_key_iam_member" "pubsub_processor_kms_access" { - count = (var.create_cmek || var.kms_key_name != null) && local.pubsub_processor_sa_email != "" ? 1 : 0 +resource "google_service_account_iam_member" "runner_actas_environment_vm" { + count = !local.using_pre_created_service_accounts && local.runner_sa_email != "" && local.environment_vm_sa_email != "" ? 1 : 0 - crypto_key_id = local.kms_key_name - role = "roles/cloudkms.cryptoKeyEncrypterDecrypter" - member = "serviceAccount:${local.pubsub_processor_sa_email}" + service_account_id = local.environment_vm_sa_name + role = "roles/iam.serviceAccountUser" + member = "serviceAccount:${local.runner_sa_email}" + + depends_on = [ + google_service_account.runner, + google_service_account.environment_vm, + ] } -resource "google_kms_crypto_key_iam_member" "proxy_vm_kms_access" { - count = (var.create_cmek || var.kms_key_name != null) && local.proxy_vm_sa_email != "" ? 1 : 0 +resource "google_service_account_iam_member" "runner_actas_proxy_vm" { + count = !local.using_pre_created_service_accounts && local.runner_sa_email != "" && local.proxy_vm_sa_email != "" ? 
1 : 0 - crypto_key_id = local.kms_key_name - role = "roles/cloudkms.cryptoKeyEncrypterDecrypter" - member = "serviceAccount:${local.proxy_vm_sa_email}" + service_account_id = local.proxy_vm_sa_name + role = "roles/iam.serviceAccountUser" + member = "serviceAccount:${local.runner_sa_email}" + + depends_on = [ + google_service_account.runner, + google_service_account.proxy_vm, + ] +} + +# ================================ +# RUNNER SA IAM MANAGER (PER-SA) +# ================================ +# Grants the runner SA getIamPolicy/setIamPolicy on the same three SAs it +# manages (runner, environment_vm, proxy_vm), via a small custom role +# bound at the service-account level. This replaces holding +# iam.serviceAccounts.{get,set}IamPolicy in the runner's project-level +# custom role, which would otherwise let the runner manage IAM on every +# service account in the project. +# +# When pre_created_service_accounts is set the operator is responsible +# for granting equivalent IAM-management permissions on the relevant SAs +# out of band; the module does not manage IAM on SAs it did not create. +resource "google_project_iam_custom_role" "runner_sa_iam_manager" { + count = var.pre_created_service_accounts.runner == "" ? 1 : 0 + + role_id = "${replace(var.runner_name, "-", "_")}_sa_iam_mgr" + title = "Ona Runner SA IAM Manager" + description = "Manage IAM policies on the runner-owned service accounts only" + project = var.project_id + + permissions = [ + "iam.serviceAccounts.getIamPolicy", + "iam.serviceAccounts.setIamPolicy", + ] +} + +resource "google_service_account_iam_member" "runner_manage_runner_iam" { + count = !local.using_pre_created_service_accounts && local.runner_sa_email != "" ? 
1 : 0 + + service_account_id = local.runner_sa_name + role = google_project_iam_custom_role.runner_sa_iam_manager[0].id + member = "serviceAccount:${local.runner_sa_email}" + + depends_on = [google_service_account.runner] +} + +resource "google_service_account_iam_member" "runner_manage_environment_vm_iam" { + count = !local.using_pre_created_service_accounts && local.runner_sa_email != "" && local.environment_vm_sa_email != "" ? 1 : 0 + + service_account_id = local.environment_vm_sa_name + role = google_project_iam_custom_role.runner_sa_iam_manager[0].id + member = "serviceAccount:${local.runner_sa_email}" + + depends_on = [ + google_service_account.runner, + google_service_account.environment_vm, + ] +} + +resource "google_service_account_iam_member" "runner_manage_proxy_vm_iam" { + count = !local.using_pre_created_service_accounts && local.runner_sa_email != "" && local.proxy_vm_sa_email != "" ? 1 : 0 + + service_account_id = local.proxy_vm_sa_name + role = google_project_iam_custom_role.runner_sa_iam_manager[0].id + member = "serviceAccount:${local.runner_sa_email}" + + depends_on = [ + google_service_account.runner, + google_service_account.proxy_vm, + ] } diff --git a/locals.tf b/locals.tf index 789a32c..0d2a212 100644 --- a/locals.tf +++ b/locals.tf @@ -1,4 +1,6 @@ locals { + module_version = trimspace(file("${path.module}/VERSION")) + # VPC project ID - defaults to project_id if not specified (for Shared VPC support) vpc_project_id = var.vpc_project_id != "" ? 
var.vpc_project_id : var.project_id @@ -16,11 +18,11 @@ locals { }) # Default images - default_runner_image = "us-docker.pkg.dev/gitpod-next-production/gitpod-next/gitpod-gcp-runner:20260408.1349" - default_proxy_image = "us-docker.pkg.dev/gitpod-next-production/gitpod-next/gitpod-proxy:20260408.1349" + default_runner_image = "us-docker.pkg.dev/gitpod-next-production/gitpod-next/gitpod-gcp-runner:20260513.1150" + default_proxy_image = "us-docker.pkg.dev/gitpod-next-production/gitpod-next/gitpod-proxy:20260513.1150" - default_prometheus_image = "us-docker.pkg.dev/gitpod-next-production/gitpod-next/prometheus:v3.5.0" - default_node_exporter_image = "us-docker.pkg.dev/gitpod-next-production/gitpod-next/node-exporter:v1.9.1" + default_prometheus_image = "us-docker.pkg.dev/gitpod-next-production/gitpod-next/prometheus:v3.11.3" + default_node_exporter_image = "us-docker.pkg.dev/gitpod-next-production/gitpod-next/node-exporter:v1.11.1" # Final images (custom or default) runner_image = var.custom_images.runner_image != "" ? var.custom_images.runner_image : local.default_runner_image @@ -30,6 +32,39 @@ locals { runner_dev_image = var.development_version != "" ? "us-docker.pkg.dev/gitpod-next-production/gitpod-next/gitpod-gcp-runner:${var.development_version}" : local.runner_image + # Container resource limits derived from VM machine type. + # GCP standard machine types follow the pattern {family}-standard-{vcpus} + # with memory = vcpus * 4 GB. We reserve ~25% for the host OS and Docker + # daemon, then allocate the rest across containers. 
+ # + # Aligned with EC2 Fargate runner limits: + # EC2 small: 1 vCPU / 3 GB task → runner gets ~1 GB + # EC2 large: 8 vCPU / 16 GB task → runner gets ~14 GB + # GCP small: 2 vCPU / 8 GB VM → runner gets 5888 MB (~5.75 GB) / 0.75 CPU + # GCP regular: 4 vCPU / 16 GB VM → runner gets 14080 MB (~13.75 GB) / 2.75 CPU + + runner_vcpus = tonumber(regex("-(\\d+)$", var.runner_vm_config.machine_type)[0]) + runner_memory_gb = local.runner_vcpus * 4 + + # Sidecar limits are fixed (small footprint). The main runner container + # gets whatever remains after sidecars and OS overhead. + runner_sidecar_memory_mb = 1792 # prometheus 1024 + auth-proxy 512 + node-exporter 256 + runner_sidecar_cpus = 1.25 # prometheus 0.5 + auth-proxy 0.5 + node-exporter 0.25 + runner_os_reserve_mb = 512 + + runner_container_memory_mb = (local.runner_memory_gb * 1024) - local.runner_sidecar_memory_mb - local.runner_os_reserve_mb + runner_container_cpus = local.runner_vcpus - local.runner_sidecar_cpus + + proxy_vcpus = tonumber(regex("-(\\d+)$", var.proxy_vm_config.machine_type)[0]) + proxy_memory_gb = local.proxy_vcpus * 4 + + proxy_sidecar_memory_mb = 768 # prometheus 512 + node-exporter 256 + proxy_sidecar_cpus = 0.5 # prometheus 0.25 + node-exporter 0.25 + proxy_os_reserve_mb = 512 + + proxy_container_memory_mb = (local.proxy_memory_gb * 1024) - local.proxy_sidecar_memory_mb - local.proxy_os_reserve_mb + proxy_container_cpus = local.proxy_vcpus - local.proxy_sidecar_cpus + # Docker config handling docker_config_enabled = var.custom_images.docker_config_json != "" docker_config_bucket_name = local.docker_config_enabled ? 
google_storage_bucket.runner_assets.name : "" @@ -56,22 +91,21 @@ locals { logs_url = "https://console.cloud.google.com/logs/query;query=resource.type%%3D%%22gce_instance%%22%%0A%%2528jsonPayload.%%22cos.googleapis.com%%2Fcontainer_name%%22%%3D%%22gitpod-runner%%22%%20OR%%20jsonPayload.%%22cos.googleapis.com%%2Fcontainer_name%%22%%3D%%22gitpod-proxy%%22%%2529;duration=PT3H?project=${var.project_id}" # Determine if we're using pre-created service accounts (IAM team handles high-privilege operations) + # Only considers the 3 active service accounts (runner, environment_vm, proxy_vm). using_pre_created_service_accounts = anytrue([ - for sa in values(var.pre_created_service_accounts) : sa != "" + var.pre_created_service_accounts.runner != "", + var.pre_created_service_accounts.environment_vm != "", + var.pre_created_service_accounts.proxy_vm != "", ]) # Service account emails (either created or pre-created) - shared across all files - runner_sa_email = var.pre_created_service_accounts.runner != "" ? var.pre_created_service_accounts.runner : try(google_service_account.runner[0].email, "") - environment_vm_sa_email = var.pre_created_service_accounts.environment_vm != "" ? var.pre_created_service_accounts.environment_vm : try(google_service_account.environment_vm[0].email, "") - build_cache_sa_email = var.pre_created_service_accounts.build_cache != "" ? var.pre_created_service_accounts.build_cache : try(google_service_account.build_cache[0].email, "") - secret_manager_sa_email = var.pre_created_service_accounts.secret_manager != "" ? var.pre_created_service_accounts.secret_manager : try(google_service_account.secret_manager[0].email, "") - pubsub_processor_sa_email = var.pre_created_service_accounts.pubsub_processor != "" ? var.pre_created_service_accounts.pubsub_processor : try(google_service_account.pubsub_processor[0].email, "") - proxy_vm_sa_email = var.pre_created_service_accounts.proxy_vm != "" ? 
var.pre_created_service_accounts.proxy_vm : try(google_service_account.proxy_vm[0].email, "") + runner_sa_email = var.pre_created_service_accounts.runner != "" ? var.pre_created_service_accounts.runner : try(google_service_account.runner[0].email, "") + environment_vm_sa_email = var.pre_created_service_accounts.environment_vm != "" ? var.pre_created_service_accounts.environment_vm : try(google_service_account.environment_vm[0].email, "") + proxy_vm_sa_email = var.pre_created_service_accounts.proxy_vm != "" ? var.pre_created_service_accounts.proxy_vm : try(google_service_account.proxy_vm[0].email, "") # Service account names for IAM bindings (full resource names) - runner_sa_name = var.pre_created_service_accounts.runner != "" ? "projects/${var.project_id}/serviceAccounts/${var.pre_created_service_accounts.runner}" : try(google_service_account.runner[0].name, "") - build_cache_sa_name = var.pre_created_service_accounts.build_cache != "" ? "projects/${var.project_id}/serviceAccounts/${var.pre_created_service_accounts.build_cache}" : try(google_service_account.build_cache[0].name, "") - secret_manager_sa_name = var.pre_created_service_accounts.secret_manager != "" ? "projects/${var.project_id}/serviceAccounts/${var.pre_created_service_accounts.secret_manager}" : try(google_service_account.secret_manager[0].name, "") - pubsub_processor_sa_name = var.pre_created_service_accounts.pubsub_processor != "" ? "projects/${var.project_id}/serviceAccounts/${var.pre_created_service_accounts.pubsub_processor}" : try(google_service_account.pubsub_processor[0].name, "") + runner_sa_name = var.pre_created_service_accounts.runner != "" ? "projects/${var.project_id}/serviceAccounts/${var.pre_created_service_accounts.runner}" : try(google_service_account.runner[0].name, "") + environment_vm_sa_name = var.pre_created_service_accounts.environment_vm != "" ? 
"projects/${var.project_id}/serviceAccounts/${var.pre_created_service_accounts.environment_vm}" : try(google_service_account.environment_vm[0].name, "") + proxy_vm_sa_name = var.pre_created_service_accounts.proxy_vm != "" ? "projects/${var.project_id}/serviceAccounts/${var.pre_created_service_accounts.proxy_vm}" : try(google_service_account.proxy_vm[0].name, "") } diff --git a/main.tf b/main.tf new file mode 100644 index 0000000..e69de29 diff --git a/modules/custom-domain-client-infra/README.md b/modules/custom-domain-client-infra/README.md index 5a6768f..a818ed9 100644 --- a/modules/custom-domain-client-infra/README.md +++ b/modules/custom-domain-client-infra/README.md @@ -17,6 +17,7 @@ This Terraform module deploys the customer-side infrastructure required to conne 3. Validation Result ├─ Valid: Forwards request to backend URL └─ Invalid: Returns 401 Unauthorized +``` ## Prerequisites diff --git a/proxy-vm.tf b/proxy-vm.tf index 6506972..c6a11c4 100644 --- a/proxy-vm.tf +++ b/proxy-vm.tf @@ -10,17 +10,18 @@ data "cloudinit_config" "proxy" { part { content_type = "text/cloud-config" content = templatefile("${path.module}/files/proxy-cloud-init.tftpl", { - RUNNER_ID = var.runner_id - PROJECT_ID = var.project_id - REGION = var.region - PROXY_DOMAIN = var.runner_domain - PROXY_IMAGE_URL = local.proxy_image - PROMETHEUS_IMAGE = local.prometheus_image - LOADBALANCER_TYPE = var.loadbalancer_type - CERTIFICATE_ID = var.certificate_id - CERTIFICATE_SECRET_ID = var.certificate_secret_id - METRICS_SECRET_ID = "${var.runner_id}-metrics" - API_ENDPOINT = var.api_endpoint + RUNNER_ID = var.runner_id + PROJECT_ID = var.project_id + REGION = var.region + PROXY_DOMAIN = var.runner_domain + PROXY_IMAGE_URL = local.proxy_image + PROMETHEUS_IMAGE = local.prometheus_image + LOADBALANCER_TYPE = var.loadbalancer_type + CERTIFICATE_ID = var.certificate_id + CERTIFICATE_SECRET_ID = var.certificate_secret_id + METRICS_SECRET_ID = "${var.runner_id}-metrics" + API_ENDPOINT = 
var.api_endpoint + RUNNER_ASSETS_BUCKET_NAME = google_storage_bucket.runner_assets.name # Proxy configuration HTTP_PROXY = local.http_proxy HTTPS_PROXY = local.https_proxy @@ -37,6 +38,9 @@ data "cloudinit_config" "proxy" { # Insecure registries configuration INSECURE_REGISTRIES_ENABLED = local.insecure_registries_enabled INSECURE_REGISTRIES_JSON = local.insecure_registries_json + # Container resource limits (computed from machine type) + PROXY_CONTAINER_MEMORY = "${local.proxy_container_memory_mb}m" + PROXY_CONTAINER_CPUS = tostring(local.proxy_container_cpus) }) } } @@ -48,7 +52,7 @@ resource "google_compute_instance_template" "proxy" { machine_type = var.proxy_vm_config.machine_type region = var.region - tags = ["gitpod-proxy", "gitpod-type-proxy"] + tags = ["gitpod-proxy", "gitpod-type-proxy", "allow-health-check"] labels = local.proxy_labels @@ -87,7 +91,9 @@ resource "google_compute_instance_template" "proxy" { } shielded_instance_config { - enable_secure_boot = true + enable_secure_boot = true + enable_vtpm = true + enable_integrity_monitoring = true } # Container-Optimized OS metadata for running the proxy container @@ -96,9 +102,10 @@ resource "google_compute_instance_template" "proxy" { google-monitoring-enabled = "true" google-logging-use-fluentbit = "true" serial-port-logging-enable = "true" + block-project-ssh-keys = "TRUE" # Cloud-init configuration for proxy setup - user-data = data.cloudinit_config.proxy.rendered + user-data = sensitive(data.cloudinit_config.proxy.rendered) "cos-metrics-enabled" = "true" } diff --git a/runner-vm.tf b/runner-vm.tf index ec5c8c9..472a2ab 100644 --- a/runner-vm.tf +++ b/runner-vm.tf @@ -36,9 +36,9 @@ resource "tls_private_key" "auth_proxy" { algorithm = "RSA" rsa_bits = 2048 - # Force recreation when rotation time changes lifecycle { create_before_destroy = true + replace_triggered_by = [time_rotating.auth_proxy_cert_rotation] } } @@ -52,9 +52,9 @@ resource "tls_self_signed_cert" "auth_proxy" { validity_period_hours = 
8760 # 1 year - # Force recreation when rotation time changes lifecycle { create_before_destroy = true + replace_triggered_by = [time_rotating.auth_proxy_cert_rotation] } allowed_uses = [ @@ -139,8 +139,8 @@ data "cloudinit_config" "runner" { METRICS_SECRET_ID = "${var.runner_id}-metrics" ENABLE_AGENTS = var.enable_agents AGENT_BUCKET_NAME = local.agent_bucket_name - HONEYCOMB_API_KEY = var.honeycomb_api_key - MIG_WARM_POOL_ENABLED = var.mig_warm_pool_enabled + RUNNER_ASSETS_BUCKET_NAME = google_storage_bucket.runner_assets.name + MIG_WARM_POOL_ENABLED = true # Proxy configuration HTTP_PROXY = local.http_proxy HTTPS_PROXY = local.https_proxy @@ -165,6 +165,11 @@ data "cloudinit_config" "runner" { CUSTOM_RUNNER_REGISTRY = local.custom_runner_registry # Environment VM labels configuration ENVIRONMENT_VM_LABELS = join(",", [for k, v in var.labels : "${k}=${v}"]) + # Module version reported to the management plane + TERRAFORM_MODULE_VERSION = local.module_version + # Container resource limits (computed from machine type) + RUNNER_CONTAINER_MEMORY = "${local.runner_container_memory_mb}m" + RUNNER_CONTAINER_CPUS = tostring(local.runner_container_cpus) }) } } @@ -197,7 +202,9 @@ resource "google_compute_instance_template" "runner" { } shielded_instance_config { - enable_secure_boot = true + enable_secure_boot = true + enable_vtpm = true + enable_integrity_monitoring = true } @@ -226,9 +233,10 @@ resource "google_compute_instance_template" "runner" { google-monitoring-enabled = "true" google-logging-use-fluentbit = "true" serial-port-logging-enable = "true" + block-project-ssh-keys = "TRUE" # Cloud-init configuration for Prometheus setup - user-data = data.cloudinit_config.runner.rendered + user-data = sensitive(data.cloudinit_config.runner.rendered) "cos-metrics-enabled" = "true" } @@ -350,8 +358,11 @@ resource "google_compute_health_check" "runner" { } -# Resource tagging for lifecycle management +# Project metadata: authoritative (default) or per-key items. 
+# Authoritative manages ALL project metadata — keys not in this block are removed. +# Per-key items only touch the keys this module needs, leaving other metadata intact. resource "google_compute_project_metadata" "runner_metadata" { + count = var.use_authoritative_project_metadata ? 1 : 0 project = var.project_id metadata = { @@ -359,3 +370,17 @@ resource "google_compute_project_metadata" "runner_metadata" { "gitpod-runner-id" = var.runner_id } } + +resource "google_compute_project_metadata_item" "enable_oslogin" { + count = var.use_authoritative_project_metadata ? 0 : 1 + project = var.project_id + key = "enable-oslogin" + value = "TRUE" +} + +resource "google_compute_project_metadata_item" "runner_id" { + count = var.use_authoritative_project_metadata ? 0 : 1 + project = var.project_id + key = "gitpod-runner-id" + value = var.runner_id +} diff --git a/storage.tf b/storage.tf index 6aa0f04..8727165 100644 --- a/storage.tf +++ b/storage.tf @@ -264,7 +264,9 @@ locals { has_certificates = var.ca_certificate != null || (var.certificate_secret_id != "" && var.certificate_secret_read) } -# Upload combined trust bundle certificate to GCS bucket +# Upload combined trust bundle certificate to GCS bucket. +# create_before_destroy ensures the new object is written before the old +# one is removed, preventing a gap if terraform apply is interrupted. resource "google_storage_bucket_object" "trust_bundle" { count = local.has_certificates ? 1 : 0 @@ -284,6 +286,10 @@ resource "google_storage_bucket_object" "trust_bundle" { has_ca_cert = var.ca_certificate != null ? "true" : "false" has_secret_cert = var.certificate_secret_id != "" && var.certificate_secret_read ? 
"true" : "false" } + + lifecycle { + create_before_destroy = true + } } # Upload Docker config.json to GCS bucket if provided diff --git a/tests/e2e/README.md b/tests/e2e/README.md deleted file mode 100644 index 9d65ce6..0000000 --- a/tests/e2e/README.md +++ /dev/null @@ -1,193 +0,0 @@ -# GCP Runner End-to-End Tests - -End-to-end tests for the GCP Runner Terraform module that validate the complete lifecycle: - -1. Creates a runner via Ona API -2. Deploys infrastructure using Terraform -3. Waits for runner to come online -4. Cleans up all resources - -## Quick Start - -### Prerequisites - -- **Tools**: `gcloud`, `terraform`, `curl`, `jq` -- **GCP Project**: With billing enabled and required APIs -- **Ona Organization**: With admin access for runner management - -## GCP Project Setup - -### 1. Create and Configure GCP Project - -```bash -# Create a new project (or use existing) -export PROJECT_ID="gcp-runner-e2e-tests" -gcloud projects create $PROJECT_ID --name="GCP Runner E2E Tests" - -# Enable billing (required for compute resources) -gcloud billing projects link $PROJECT_ID --billing-account=YOUR_BILLING_ACCOUNT_ID - -# Set as default project -gcloud config set project $PROJECT_ID - -# Enable all required APIs for the Terraform deployment -# This enables all APIs that the runner infrastructure needs -gcloud services enable \ - cloudresourcemanager.googleapis.com \ - iam.googleapis.com \ - serviceusage.googleapis.com \ - compute.googleapis.com \ - dns.googleapis.com \ - certificatemanager.googleapis.com \ - secretmanager.googleapis.com \ - storage.googleapis.com \ - pubsub.googleapis.com \ - cloudfunctions.googleapis.com \ - monitoring.googleapis.com \ - logging.googleapis.com \ - redis.googleapis.com \ - run.googleapis.com \ - vpcaccess.googleapis.com \ - servicenetworking.googleapis.com \ - artifactregistry.googleapis.com \ - iamcredentials.googleapis.com \ - --project=$PROJECT_ID - -# Wait a moment for API enablement to propagate -echo "Waiting for APIs to be 
fully enabled..." -sleep 30 -``` - -### 2. Create Service Account - -```bash -# Create service account -gcloud iam service-accounts create gcp-runner-e2e-tests \ - --display-name="GCP Runner E2E Tests" \ - --description="Service account for GCP Runner E2E tests" - -# Get service account email -export SA_EMAIL="gcp-runner-e2e-tests@${PROJECT_ID}.iam.gserviceaccount.com" -``` - -### 3. Assign Required Roles - -```bash -# Main project roles -roles=( - "roles/compute.admin" - "roles/dns.admin" - "roles/iam.serviceAccountAdmin" - "roles/iam.serviceAccountUser" - "roles/iam.roleAdmin" - "roles/resourcemanager.projectIamAdmin" - "roles/certificatemanager.owner" - "roles/secretmanager.admin" - "roles/storage.admin" - "roles/pubsub.admin" - "roles/cloudfunctions.admin" - "roles/serviceusage.serviceUsageAdmin" - "roles/monitoring.admin" - "roles/logging.admin" - "roles/artifactregistry.admin" - "roles/servicenetworking.networksAdmin" - "roles/redis.admin" -) - -for role in "${roles[@]}"; do - gcloud projects add-iam-policy-binding $PROJECT_ID \ - --member="serviceAccount:$SA_EMAIL" \ - --role="$role" -done -``` - -### 4. Create and Download Service Account Key - -```bash -# Create key file -gcloud iam service-accounts keys create ~/gcp-runner-e2e-key.json \ - --iam-account=$SA_EMAIL - -# Set environment variable -export GOOGLE_APPLICATION_CREDENTIALS="$HOME/gcp-runner-e2e-key.json" -``` - -### 5. DNS Project Setup - -The E2E tests require access to the `dns-for-playgrounds` project to create DNS delegation records in the `tests-doptig-com` managed zone. This enables proper DNS resolution for the test runner domain (`$TEST_ID.tests.doptig.com`). 
- -```bash -# Grant DNS admin permissions to the dns-for-playgrounds project -gcloud projects add-iam-policy-binding dns-for-playgrounds \ - --member="serviceAccount:$SA_EMAIL" \ - --role="roles/dns.admin" - -# Verify access to the required DNS zone -gcloud dns managed-zones describe "tests-doptig-com" --project="dns-for-playgrounds" -``` - -**Important**: Without this setup, the E2E script will skip DNS delegation and show warnings, and **the tests will fail**. The runner cannot come online without proper DNS resolution because it needs to communicate with Gitpod's control plane via `https://${runner_id}.${runner_domain}/`. - -### Environment Setup - -```bash -# Required -export GCP_PROJECT_ID="gcp-runner-e2e-tests" -export GITPOD_TOKEN="your-organization-pat-token" -export GOOGLE_APPLICATION_CREDENTIALS="$HOME/gcp-runner-e2e-key.json" - -# Optional (with defaults) -export GCP_REGION="us-central1" # Default: us-central1 -export GITPOD_API_ENDPOINT="https://app.gitpod.io/api" # Default -export E2E_TEST_ID="my-test-$(date +%s)" # Auto-generated if not set -``` - -### Running Tests - -```bash -# Run the test -./tests/e2e/scripts/e2e-test.sh - -# Show help -./tests/e2e/scripts/e2e-test.sh --help -``` - -## GitHub Actions - -Runs daily at 6 AM UTC and can be triggered manually. 
- -### Required Secrets - -- `E2E_GITPOD_TOKEN` - Organization-specific Ona PAT -- `E2E_GOOGLE_APPLICATION_CREDENTIALS` - Service account key JSON -- `NEXT_ALERTS_SLACK_WEBHOOK` - Slack webhook for failure notifications - -### Optional Variables - -- `E2E_GCP_PROJECT_ID` - GCP project ID (defaults to gcp-runner-e2e-tests) -- `E2E_GCP_REGION` - GCP region (defaults to us-central1) -- `E2E_GITPOD_API_ENDPOINT` - API endpoint (defaults to https://app.gitpod.io/api) - -## Service Account Permissions - -The service account needs these roles in the **main GCP project**: -- `Compute Admin` - For VMs, disks, networks, and load balancers -- `DNS Administrator` - For DNS zones and records -- `Service Account Admin` - For creating and managing service accounts -- `Service Account User` - For using service accounts in compute resources -- `Role Administrator` - For creating and managing custom IAM roles -- `Project IAM Admin` - For IAM role bindings and custom roles -- `Certificate Manager Owner` - For SSL certificate management (includes delete permissions) -- `Secret Manager Admin` - For storing sensitive configuration -- `Storage Admin` - For build cache buckets -- `Pub/Sub Admin` - For event-driven reconciliation -- `Cloud Functions Admin` - For auth proxy functions -- `Service Usage Admin` - For enabling required APIs -- `Monitoring Admin` - For health checks and alerting -- `Logging Admin` - For log management -- `Artifact Registry Admin` - For container image repositories -- `Service Networking Networks Admin` - For VPC peering and private service access - -**DNS Project Permissions** (mandatory - tests will fail without this): -The service account also needs: -- `DNS Administrator` role in the **dns-for-playgrounds project** limited to the **tests-doptig-com zone** using IAM conditions diff --git a/tests/e2e/scripts/e2e-test.sh b/tests/e2e/scripts/e2e-test.sh deleted file mode 100755 index 3d6b95d..0000000 --- a/tests/e2e/scripts/e2e-test.sh +++ /dev/null @@ -1,585 
+0,0 @@ -#!/bin/bash -# GCP Runner End-to-End Test -# -# This script tests the complete lifecycle of a GCP runner deployment: -# 1. Creates a runner via Ona API (using organization-specific PAT) -# 2. Deploys infrastructure using Terraform -# 3. Waits for runner to come online -# 4. Cleans up all resources -# -# Required: GCP_PROJECT_ID, GITPOD_TOKEN (org-specific PAT) -# Optional: GCP_REGION (defaults to us-central1) -# GITPOD_API_ENDPOINT (defaults to https://app.gitpod.io/api) -# GOOGLE_APPLICATION_CREDENTIALS (path to service account key) -# -# Usage: ./scripts/e2e-test.sh [--help] - -set -euo pipefail - -# Script directory -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -REPO_ROOT="$(cd "$SCRIPT_DIR/../../.." && pwd)" -TERRAFORM_DIR="$REPO_ROOT/examples/runner-with-networking" - -# Exit codes -readonly EXIT_GENERAL_FAILURE=1 -readonly EXIT_CONFIG_ERROR=2 -readonly EXIT_AUTH_FAILURE=3 -readonly EXIT_INFRA_FAILURE=4 -readonly EXIT_CLEANUP_FAILURE=5 - -# Global variables for cleanup tracking -RUNNER_ID="" -TERRAFORM_APPLIED=false -TEST_ID="" -TFVARS_FILE="" - -# Logging functions -log() { - echo "[$(date +'%Y-%m-%d %H:%M:%S')] $*" >&2 -} - -log_info() { - log "INFO: $*" -} - -log_error() { - log "ERROR: $*" -} - -log_warn() { - log "WARN: $*" -} - -# Cleanup function - always runs on exit -cleanup() { - local original_exit_code=$? - local cleanup_exit_code=0 - log_info "Starting cleanup process..." - log_info "Original exit code: $original_exit_code" - - # Destroy Terraform infrastructure first (while runner is still available) - if [[ "$TERRAFORM_APPLIED" == "true" ]]; then - log_info "Destroying Terraform infrastructure..." - log_info "This includes any partially created resources from failed/timed-out apply" - if ! 
destroy_terraform; then - log_error "Failed to destroy Terraform infrastructure" - cleanup_exit_code=$EXIT_CLEANUP_FAILURE - fi - else - log_info "Skipping Terraform destroy - no apply was attempted" - fi - - # Clean up generated files - if [[ -n "$TFVARS_FILE" && -f "$TFVARS_FILE" ]]; then - log_info "Cleaning up tfvars file: $TFVARS_FILE" - rm -f "$TFVARS_FILE" - fi - - # Clean up delegation terraform file - local delegation_tf="$TERRAFORM_DIR/e2e-delegation.tf" - if [[ -f "$delegation_tf" ]]; then - log_info "Cleaning up delegation file: $delegation_tf" - rm -f "$delegation_tf" - fi - - - - # Delete runner last (after infrastructure is cleaned up) - if [[ -n "$RUNNER_ID" ]]; then - log_info "Deleting runner: $RUNNER_ID" - if ! delete_runner "$RUNNER_ID"; then - log_error "Failed to delete runner" - cleanup_exit_code=$EXIT_CLEANUP_FAILURE - fi - fi - - log_info "Cleanup completed" - - # Show final result message based on original exit code - if [[ $original_exit_code -eq 0 ]]; then - echo "" - echo "==========================================" - echo "🎉 E2E TEST COMPLETED SUCCESSFULLY!" - echo "==========================================" - echo "Test ID: $TEST_ID" - echo "All resources have been cleaned up." - echo "" - else - echo "" - echo "==========================================" - echo "❌ E2E TEST FAILED!" - echo "==========================================" - echo "Test ID: $TEST_ID" - echo "Exit code: $original_exit_code" - echo "Check the logs above for error details." 
- echo "" - case $original_exit_code in - "$EXIT_CONFIG_ERROR") - echo "Issue: Configuration or environment error" - echo "Fix: Check required environment variables and tool installations" - ;; - "$EXIT_AUTH_FAILURE") - echo "Issue: Authentication failure" - echo "Fix: Verify GITPOD_TOKEN has runner management permissions and you are org admin" - ;; - "$EXIT_INFRA_FAILURE") - echo "Issue: Infrastructure deployment failure" - echo "Fix: Check GCP quotas, permissions, and Terraform configuration" - ;; - "$EXIT_CLEANUP_FAILURE") - echo "Issue: Cleanup failure" - echo "Fix: Manual cleanup may be required - check GCP console for orphaned resources" - ;; - *) - echo "Issue: General failure" - echo "Fix: Check the error logs above for specific details" - ;; - esac - echo "" - fi - - # Show additional cleanup failure message if needed - if [[ $cleanup_exit_code -ne 0 ]]; then - echo "" - echo "⚠️ CLEANUP WARNING:" - echo "Cleanup failed during resource removal." - echo "Manual cleanup may be required - check GCP console for orphaned resources." - echo "" - fi - - exit $original_exit_code -} - -# Set up cleanup trap -trap cleanup EXIT - -# Environment validation -validate_environment() { - log_info "Validating environment..." - - local required_vars=( - "GCP_PROJECT_ID" - "GITPOD_TOKEN" - ) - - local missing_vars=() - for var in "${required_vars[@]}"; do - if [[ -z "${!var:-}" ]]; then - missing_vars+=("$var") - fi - done - - if [[ ${#missing_vars[@]} -gt 0 ]]; then - log_error "Missing required environment variables: ${missing_vars[*]}" - log_error "Please set all required variables before running the test" - exit $EXIT_CONFIG_ERROR - fi - - # Check required tools - local required_tools=("curl" "jq" "terraform" "gcloud") - local missing_tools=() - for tool in "${required_tools[@]}"; do - if ! 
command -v "$tool" >/dev/null 2>&1; then - missing_tools+=("$tool") - fi - done - - if [[ ${#missing_tools[@]} -gt 0 ]]; then - log_error "Missing required tools: ${missing_tools[*]}" - exit $EXIT_CONFIG_ERROR - fi - - # Set default API endpoint if not provided - GITPOD_API_ENDPOINT="${GITPOD_API_ENDPOINT:-https://app.gitpod.io/api}" - export GITPOD_API_ENDPOINT - log_info "Using Ona API endpoint: $GITPOD_API_ENDPOINT" - - # Set default GCP region if not provided - GCP_REGION="${GCP_REGION:-us-central1}" - export GCP_REGION - log_info "Using GCP region: $GCP_REGION" - - # Validate GCP authentication - validate_gcp_authentication - - # Generate test ID - TEST_ID="${E2E_TEST_ID:-e2e-$(date +%s)}" - log_info "Test ID: $TEST_ID" - - log_info "Environment validation completed" -} - -# GCP authentication validation -validate_gcp_authentication() { - log_info "Validating GCP authentication..." - - # Check if service account key file is provided - if [[ -n "${GOOGLE_APPLICATION_CREDENTIALS:-}" ]]; then - if [[ ! -f "$GOOGLE_APPLICATION_CREDENTIALS" ]]; then - log_error "Service account key file not found: $GOOGLE_APPLICATION_CREDENTIALS" - exit $EXIT_CONFIG_ERROR - fi - - log_info "Using service account key file: $GOOGLE_APPLICATION_CREDENTIALS" - - # Activate service account - if ! gcloud auth activate-service-account --key-file="$GOOGLE_APPLICATION_CREDENTIALS" --quiet; then - log_error "Failed to activate service account" - exit $EXIT_AUTH_FAILURE - fi - - log_info "Service account activated successfully" - else - log_info "No GOOGLE_APPLICATION_CREDENTIALS provided, using existing gcloud authentication" - fi - - # Set the GCP project - if ! gcloud config set project "$GCP_PROJECT_ID" --quiet; then - log_error "Failed to set GCP project: $GCP_PROJECT_ID" - exit $EXIT_CONFIG_ERROR - fi - - # Verify authentication by testing access to the project - if ! 
gcloud projects describe "$GCP_PROJECT_ID" --quiet >/dev/null 2>&1; then - log_error "Cannot access GCP project: $GCP_PROJECT_ID" - log_error "Please check:" - log_error " 1. Project exists and you have access" - log_error " 2. Service account has necessary permissions" - log_error " 3. Authentication is properly configured" - exit $EXIT_AUTH_FAILURE - fi - - log_info "GCP authentication validated for project: $GCP_PROJECT_ID" -} - -# Ona API functions -gitpod_api_call() { - local service="$1" - local method="$2" - local data="$3" - - local url="$GITPOD_API_ENDPOINT/gitpod.v1.$service/$method" - - curl -X POST \ - -H "Authorization: Bearer $GITPOD_TOKEN" \ - -H "Content-Type: application/json" \ - -d "$data" \ - --silent \ - --show-error \ - --fail \ - "$url" -} - -# Wrapper for RunnerService calls -runner_api_call() { - local method="$1" - local data="$2" - gitpod_api_call "RunnerService" "$method" "$data" -} - -create_runner() { - log_info "Creating runner..." - - local request_data - request_data=$(jq -n \ - --arg name "$TEST_ID" \ - --arg region "$GCP_REGION" \ - '{ - name: $name, - provider: "RUNNER_PROVIDER_GCP", - spec: { - desiredPhase: "RUNNER_PHASE_ACTIVE", - configuration: { - region: $region, - autoUpdate: true, - releaseChannel: "RUNNER_RELEASE_CHANNEL_STABLE" - }, - variant: "RUNNER_VARIANT_ENTERPRISE" - } - }') - - local response - if ! response=$(runner_api_call "CreateRunner" "$request_data"); then - log_error "Failed to create runner. Check that:" - log_error " 1. GITPOD_TOKEN has runner management permissions" - log_error " 2. You are an organization admin" - log_error " 3. 
GCP runner creation is enabled for your organization" - exit $EXIT_AUTH_FAILURE - fi - - RUNNER_ID=$(echo "$response" | jq -r '.runner.runnerId') - local access_token - # Try exchangeToken first (new API), fallback to accessToken (deprecated) - access_token=$(echo "$response" | jq -r '.exchangeToken // .accessToken') - - if [[ "$RUNNER_ID" == "null" || -z "$RUNNER_ID" ]]; then - log_error "Failed to extract runner ID from response" - exit $EXIT_GENERAL_FAILURE - fi - - if [[ "$access_token" == "null" || -z "$access_token" ]]; then - log_error "Failed to extract access token from response" - exit $EXIT_GENERAL_FAILURE - fi - - log_info "Created runner: $RUNNER_ID" - - # Export for terraform - export TF_VAR_runner_id="$RUNNER_ID" - export TF_VAR_runner_token="$access_token" -} - -get_runner_status() { - local runner_id="$1" - - local request_data - request_data=$(jq -n --arg id "$runner_id" '{runnerId: $id}') - - local response - if ! response=$(runner_api_call "GetRunner" "$request_data"); then - return 1 - fi - - echo "$response" | jq -r '.runner.status.phase // "UNKNOWN"' -} - -delete_runner() { - local runner_id="$1" - - log_info "Deleting runner: $runner_id" - - local request_data - request_data=$(jq -n --arg id "$runner_id" '{runnerId: $id}') - - if runner_api_call "DeleteRunner" "$request_data" >/dev/null; then - log_info "Successfully deleted runner" - return 0 - else - log_warn "Failed to delete runner (may not exist)" - return 1 - fi -} - -# Terraform functions -setup_terraform() { - log_info "Setting up Terraform configuration..." - - # Create tfvars file for this test - TFVARS_FILE="$TERRAFORM_DIR/e2e-test-$TEST_ID.tfvars" - - cat > "$TFVARS_FILE" </dev/null 2>&1; then - log_info "Creating DNS delegation configuration..." - cat > "$delegation_tf" < "$delegation_tf" </dev/null || echo "N/A") - local dns_zone - dns_zone=$(terraform output -raw dns_zone_name 2>/dev/null || echo "N/A") - - log_info "Infrastructure deployed successfully!" 
- log_info " • Load Balancer IP: $lb_ip" - log_info " • DNS Zone: $dns_zone" - log_info " • Runner Domain: $TEST_ID.tests.doptig.com" - else - log_error "Terraform apply failed" - exit $EXIT_INFRA_FAILURE - fi -} - -destroy_terraform() { - log_info "Destroying Terraform infrastructure..." - - cd "$TERRAFORM_DIR" - - if [[ -f "$(basename "$TFVARS_FILE")" ]]; then - if terraform destroy -auto-approve -input=false -no-color -var-file="$(basename "$TFVARS_FILE")"; then - TERRAFORM_APPLIED=false - log_info "Terraform destroy completed successfully" - return 0 - else - log_error "Terraform destroy failed" - return 1 - fi - else - log_warn "Tfvars file not found, attempting destroy without var-file" - if terraform destroy -auto-approve -input=false -no-color; then - TERRAFORM_APPLIED=false - log_info "Terraform destroy completed successfully" - return 0 - else - log_error "Terraform destroy failed" - return 1 - fi - fi -} - -# Wait for runner to come online -wait_for_runner_online() { - log_info "Waiting for runner to come online..." - - local max_attempts=90 # 30 minutes with 20s intervals - local attempt=1 - local sleep_time=20 - - while [[ $attempt -le $max_attempts ]]; do - log_info "Checking runner status (attempt $attempt/$max_attempts)..." - - local status - if status=$(get_runner_status "$RUNNER_ID"); then - log_info "Runner status: $status" - - if [[ "$status" == "RUNNER_PHASE_ACTIVE" ]]; then - log_info "✅ Runner is online and ready for use!" - return 0 - fi - else - log_warn "Failed to get runner status" - fi - - if [[ $attempt -lt $max_attempts ]]; then - log_info "Waiting ${sleep_time}s before next check..." - sleep $sleep_time - fi - - ((attempt++)) - done - - log_error "Runner failed to come online within 30 minutes timeout" - exit $EXIT_GENERAL_FAILURE -} - -# Main test workflow -run_e2e_test() { - log_info "Starting GCP Runner E2E test..." 
- - validate_environment - create_runner - setup_terraform - apply_terraform - wait_for_runner_online - - log_info "All test phases completed successfully!" -} - -# Main function -main() { - case "${1:-}" in - --help|-h) - echo "Usage: $0 [--help]" - echo "" - echo "Options:" - echo " --help, -h Show this help message" - echo "" - echo "Environment variables required:" - echo " GCP_PROJECT_ID GCP project ID for testing" - echo " GITPOD_TOKEN Organization-specific PAT token with runner management permissions" - echo "" - echo "Optional:" - echo " GCP_REGION GCP region (default: us-central1)" - echo " GITPOD_API_ENDPOINT API endpoint (default: https://app.gitpod.io/api)" - echo " E2E_TEST_ID Custom test identifier" - echo "" - echo "GCP Authentication (choose one):" - echo " GOOGLE_APPLICATION_CREDENTIALS Path to service account key file" - echo " OR use existing gcloud authentication (gcloud auth login)" - echo "" - echo "Note: Cleanup is handled automatically via EXIT trap" - ;; - "") - run_e2e_test - ;; - *) - log_error "Unknown option: '$1'" - echo "Use --help for usage information" - echo "Valid usage: $0 [--help]" - exit $EXIT_CONFIG_ERROR - ;; - esac -} - -# Run main function with all arguments -main "$@" diff --git a/variables.tf b/variables.tf index 98b5c7b..a66481d 100644 --- a/variables.tf +++ b/variables.tf @@ -281,26 +281,25 @@ variable "kms_key_name" { variable "pre_created_service_accounts" { description = "Pre-created service accounts to use instead of creating new ones. If provided, all IAM resources become optional." type = object({ - runner = optional(string, "") - environment_vm = optional(string, "") + runner = optional(string, "") + environment_vm = optional(string, "") + proxy_vm = optional(string, "") + + # Deprecated: values are ignored. Kept for backward compatibility. 
build_cache = optional(string, "") secret_manager = optional(string, "") pubsub_processor = optional(string, "") - proxy_vm = optional(string, "") }) default = { - runner = "" - environment_vm = "" - build_cache = "" - secret_manager = "" - pubsub_processor = "" - proxy_vm = "" + runner = "" + environment_vm = "" + proxy_vm = "" } validation { condition = alltrue([ - for sa in values(var.pre_created_service_accounts) : - sa == "" || can(regex("^[a-z][a-z0-9-]{4,28}[a-z0-9]@[a-z0-9-]+\\.iam\\.gserviceaccount\\.com$", sa)) + for k, sa in var.pre_created_service_accounts : + sa == "" || contains(["build_cache", "secret_manager", "pubsub_processor"], k) || can(regex("^[a-z][a-z0-9-]{4,28}[a-z0-9]@[a-z0-9-]+\\.iam\\.gserviceaccount\\.com$", sa)) ]) error_message = "Service account emails must be in the format: name@project.iam.gserviceaccount.com" } @@ -338,15 +337,10 @@ variable "enable_agents" { default = true } -variable "honeycomb_api_key" { - description = "Honeycomb API key for development tracing. Enables tracing on the runner and environments when set." - type = string - default = "" - sensitive = true -} - -variable "mig_warm_pool_enabled" { - description = "Enable warm pool support using GCP Managed Instance Groups (MIGs) for faster environment startup" +variable "use_authoritative_project_metadata" { + description = "When true (default), uses authoritative google_compute_project_metadata which manages all project metadata. Set to false to use per-key metadata items that leave other project metadata untouched." type = bool - default = false + default = true } + +