diff --git a/gateway/it/Makefile b/gateway/it/Makefile index 0971ba4b..fd394daf 100644 --- a/gateway/it/Makefile +++ b/gateway/it/Makefile @@ -20,8 +20,9 @@ # Run integration tests (always with coverage) # Requires: make build-coverage-image (run once before tests) +# Uses script to force unbuffered output on macOS test: - go test -v ./... + script -q /dev/null go test -v -count=1 ./... # Build all coverage-instrumented gateway components build-coverage: diff --git a/gateway/it/config_manager.go b/gateway/it/config_manager.go new file mode 100644 index 00000000..e54cd23e --- /dev/null +++ b/gateway/it/config_manager.go @@ -0,0 +1,71 @@ +package it + +import ( + "context" + "fmt" + "log" + "strings" + "sync" + + "github.com/cucumber/godog" +) + +const ConfigTagPrefix = "@config-" + +// GatewayConfigManager handles configuration switching for the gateway +type GatewayConfigManager struct { + registry *ConfigProfileRegistry + composeManager *ComposeManager + currentProfile string + mu sync.Mutex +} + +// NewGatewayConfigManager creates a new config manager +func NewGatewayConfigManager(cm *ComposeManager) *GatewayConfigManager { + return &GatewayConfigManager{ + registry: NewConfigProfileRegistry(), + composeManager: cm, + currentProfile: "default", // Assume default on startup + } +} + +// EnsureConfig checks the scenario tags and restarts the gateway if a different config is required +func (m *GatewayConfigManager) EnsureConfig(ctx context.Context, sc *godog.Scenario) error { + m.mu.Lock() + defer m.mu.Unlock() + + requiredProfile := m.extractConfigTag(sc) + if requiredProfile == "" { + requiredProfile = "default" + } + + if m.currentProfile == requiredProfile { + return nil // No restart needed + } + + log.Printf("Switching gateway config from '%s' to '%s'...", m.currentProfile, requiredProfile) + + profile, ok := m.registry.Get(requiredProfile) + if !ok { + return fmt.Errorf("unknown config profile: %s", requiredProfile) + } + + // Restart gateway-controller with new env vars + if err := m.composeManager.RestartGatewayController(ctx, profile.EnvVars); err != nil { + return fmt.Errorf("failed to restart gateway with profile %s: %w", requiredProfile, err) + } + + m.currentProfile = requiredProfile + log.Printf("Switched to '%s' profile successfully", requiredProfile) + return nil +} + +// extractConfigTag finds the first tag starting with @config- and returns the suffix +func (m *GatewayConfigManager) extractConfigTag(sc *godog.Scenario) string { + for _, tag := range sc.Tags { + if strings.HasPrefix(tag.Name, ConfigTagPrefix) { + return strings.TrimPrefix(tag.Name, ConfigTagPrefix) + } + } + return "" +} diff --git a/gateway/it/config_profiles.go b/gateway/it/config_profiles.go new file mode 100644 index 00000000..df1420ab --- /dev/null +++ b/gateway/it/config_profiles.go @@ -0,0 +1,74 @@ +package it + +// ConfigProfile defines a named configuration set for the gateway +type ConfigProfile struct { + Name string + EnvVars map[string]string + Description string +} + +// ConfigProfileRegistry manages the available configuration profiles +type ConfigProfileRegistry struct { + profiles map[string]*ConfigProfile + defaultProfile string +} + +// NewConfigProfileRegistry creates a new registry with standard profiles +func NewConfigProfileRegistry() *ConfigProfileRegistry { + registry := &ConfigProfileRegistry{ + profiles: make(map[string]*ConfigProfile), + defaultProfile: "default", + } + + // Register standard profiles + registry.Register(&ConfigProfile{ + Name: "default", + EnvVars: 
map[string]string{ + "GATEWAY_LOGGING_LEVEL": "info", + "GATEWAY_STORAGE_TYPE": "sqlite", + }, + Description: "Standard configuration using SQLite and Info logging", + }) + + registry.Register(&ConfigProfile{ + Name: "debug", + EnvVars: map[string]string{ + "GATEWAY_LOGGING_LEVEL": "debug", + "GATEWAY_STORAGE_TYPE": "sqlite", + }, + Description: "Debug configuration enabling verbose logging", + }) + + registry.Register(&ConfigProfile{ + Name: "memory", + EnvVars: map[string]string{ + "GATEWAY_LOGGING_LEVEL": "info", + "GATEWAY_STORAGE_TYPE": "memory", + }, + Description: "In-memory storage configuration (non-persistent)", + }) + + registry.Register(&ConfigProfile{ + Name: "tracing", + EnvVars: map[string]string{ + "GATEWAY_LOGGING_LEVEL": "info", + "GATEWAY_STORAGE_TYPE": "memory", + "GATEWAY_TRACING_ENABLED": "true", + }, + Description: "Configuration with OpenTelemetry tracing enabled", + }) + + return registry + +} + +// Register adds a profile to the registry +func (r *ConfigProfileRegistry) Register(profile *ConfigProfile) { + r.profiles[profile.Name] = profile +} + +// Get retrieves a profile by name +func (r *ConfigProfileRegistry) Get(name string) (*ConfigProfile, bool) { + profile, ok := r.profiles[name] + return profile, ok +} diff --git a/gateway/it/configs/config.yaml b/gateway/it/configs/config.yaml new file mode 100644 index 00000000..6a2390d8 --- /dev/null +++ b/gateway/it/configs/config.yaml @@ -0,0 +1,398 @@ +gateway_controller: + + # Server configuration + server: + # REST API port for gateway management + api_port: 9090 + + # xDS gRPC port for Envoy communication + xds_port: 18000 + + # Graceful shutdown timeout + shutdown_timeout: 15s + + # Policy xDS Server configuration + policyserver: + # Enable or disable the policy xDS server + enabled: true + + # Policy xDS gRPC port for policy distribution + port: 18001 + + # TLS configuration for secure policy xDS communication + tls: + # Enable or disable TLS + enabled: false + + # Path to TLS certificate file (required if TLS is enabled) + cert_file: "./certs/server.crt" + + # Path to TLS private key file (required if TLS is enabled) + key_file: "./certs/server.key" + + # Storage configuration + storage: + # Storage type: "sqlite", "postgres" (future), or "memory" + # - sqlite: Use SQLite embedded database for persistence + # - postgres: Use PostgreSQL database for persistence (future support) + # - memory: No persistent storage, all configs lost on restart (useful for testing) + type: sqlite + + # SQLite configuration (used when type=sqlite) + sqlite: + path: ./data/gateway.db + + # Policy configuration + policies: + # Directory containing policy definitions + definitions_path: ./default-policies + + # Router (Envoy) configuration + router: + # Gateway host for incoming requests + gateway_host: "*" + + # Access logs configuration + access_logs: + # Enable or disable access logs + enabled: true + + # Log format: "json" or "text" + # - json: Structured JSON format (recommended for log aggregation) + # - text: Human-readable text format + format: json + + # JSON format fields - key-value pairs for structured logging + # Uses Envoy command operators: https://www.envoyproxy.io/docs/envoy/latest/configuration/observability/access_log/usage + json_fields: + start_time: "%START_TIME%" + method: "%REQ(:METHOD)%" + path: "%REQ(X-ENVOY-ORIGINAL-PATH?:PATH)%" + protocol: "%PROTOCOL%" + response_code: "%RESPONSE_CODE%" + response_flags: "%RESPONSE_FLAGS%" + response_flags_long: "%RESPONSE_FLAGS_LONG%" + bytes_received: "%BYTES_RECEIVED%" 
+ bytes_sent: "%BYTES_SENT%" + duration: "%DURATION%" + upstream_service_time: "%RESP(X-ENVOY-UPSTREAM-SERVICE-TIME)%" + x_forwarded_for: "%REQ(X-FORWARDED-FOR)%" + user_agent: "%REQ(USER-AGENT)%" + request_id: "%REQ(X-REQUEST-ID)%" + authority: "%REQ(:AUTHORITY)%" + upstream_host: "%UPSTREAM_HOST%" + upstream_cluster: "%UPSTREAM_CLUSTER%" + + # Text format template - used when format is "text" + # Uses Envoy command operators: https://www.envoyproxy.io/docs/envoy/latest/configuration/observability/access_log/usage + text_format: | + [%START_TIME%] "%REQ(:METHOD)% %REQ(X-ENVOY-ORIGINAL-PATH?:PATH)% %PROTOCOL%" %RESPONSE_CODE% %RESPONSE_FLAGS% %BYTES_RECEIVED% %BYTES_SENT% %DURATION% "%REQ(X-FORWARDED-FOR)%" "%REQ(USER-AGENT)%" "%REQ(X-REQUEST-ID)%" "%REQ(:AUTHORITY)%" "%UPSTREAM_HOST%" + + # Listener port for incoming HTTP traffic (Envoy proxy port) + listener_port: 8080 + + # HTTPS listener configuration + https_enabled: true # Enable/disable HTTPS listener + https_port: 8443 # HTTPS listener port + + # Downstream TLS configuration (for HTTPS listener) + downstream_tls: + # Path to server certificate (PEM format) + cert_path: "./listener-certs/default-listener.crt" + + # Path to server private key (PEM format) + key_path: "./listener-certs/default-listener.key" + + # Minimum TLS protocol version (TLS1_0, TLS1_1, TLS1_2, TLS1_3) + minimum_protocol_version: TLS1_2 + + # Maximum TLS protocol version (TLS1_0, TLS1_1, TLS1_2, TLS1_3) + maximum_protocol_version: TLS1_3 + + # Cipher suites (comma-separated) + ciphers: "ECDHE-ECDSA-AES128-GCM-SHA256,ECDHE-RSA-AES128-GCM-SHA256,ECDHE-ECDSA-AES128-SHA,ECDHE-RSA-AES128-SHA,AES128-GCM-SHA256,AES128-SHA,ECDHE-ECDSA-AES256-GCM-SHA384,ECDHE-RSA-AES256-GCM-SHA384,ECDHE-ECDSA-AES256-SHA,ECDHE-RSA-AES256-SHA,AES256-GCM-SHA384,AES256-SHA" + + # Upstream configuration + envoy_upstream: + # TLS configuration for upstream connections + tls: + minimum_protocol_version: TLS1_2 + maximum_protocol_version: TLS1_3 + ciphers: "ECDHE-ECDSA-AES128-GCM-SHA256,ECDHE-RSA-AES128-GCM-SHA256,ECDHE-ECDSA-AES128-SHA,ECDHE-RSA-AES128-SHA,AES128-GCM-SHA256,AES128-SHA,ECDHE-ECDSA-AES256-GCM-SHA384,ECDHE-RSA-AES256-GCM-SHA384,ECDHE-ECDSA-AES256-SHA,ECDHE-RSA-AES256-SHA,AES256-GCM-SHA384,AES256-SHA" + trusted_cert_path: /etc/ssl/certs/ca-certificates.crt + custom_certs_path: ./certificates # Directory containing custom trusted certificates (e.g., self-signed certs) + verify_host_name: true + disable_ssl_verification: false + + # Timeout configurations for upstream connections + timeouts: + route_timeout_in_seconds: 60 + max_route_timeout_in_seconds: 60 + route_idle_timeout_in_seconds: 300 + tracing_service_name: router + + + # Policy Engine ext_proc filter configuration + policy_engine: + # Enable or disable policy engine ext_proc filter + enabled: true + + # Policy engine host (hostname or IP address) + host: policy-engine + + # Policy engine ext_proc port + port: 9001 + + # Timeout for gRPC service connection (in milliseconds) + timeout_ms: 60000 + + # Failure mode: false = fail closed (deny requests on error), true = fail open (allow requests on error) + failure_mode_allow: false + + # Route cache action: DEFAULT, RETAIN, or CLEAR + # RETAIN: Maintain the route cache across requests (recommended for performance) + route_cache_action: RETAIN + + # Allow per-route override of ext_proc configuration + allow_mode_override: true + + # Request header processing mode: DEFAULT, SEND, or SKIP + # SEND: Forward request headers to policy engine for processing + request_header_mode: SEND + 
+ # Message timeout for policy engine processing (in milliseconds) + message_timeout_ms: 60000 + + # TLS configuration for policy engine connection + tls: + # Enable TLS for secure communication with policy engine + enabled: false + + # Client certificate path (for mutual TLS authentication) + cert_path: "" + + # Client private key path (for mutual TLS authentication) + key_path: "" + + # CA certificate path (for server certificate validation) + ca_path: "" + + # Server name for SNI (optional, defaults to host) + server_name: "" + + # Skip server certificate verification (insecure, development only) + skip_verify: false + + + # Authentication configuration + # - `users`: List of locally defined users (username/password + local roles). + # Passwords can be plain text or hashed depending on your implementation. We support argon2id and bcrypt hashing. + # - `idp`: Optional IDP (OIDC/JWKS) configuration to validate incoming JWTs. + # `role_mapping` maps local roles to one or more roles in the IDP. + auth: + # Local basic authentication configuration + basic: + enabled: true + # Local users + users: + # Example user entries: + # - username: "user1" + # password: "$argon2id.." # encrypted hash or plain string depending on implementation + # password_hashed: true + # roles: ["developer", "viewer"] + - username: "admin" + password: "admin" + password_hashed: false + roles: ["admin"] + # IDP (OIDC/JWKS) configuration for validating JWTs (optional) + idp: + enabled: false + # JWKS endpoint used to fetch public keys for JWT validation + jwks_url: "" + # Expected issuer value in the JWT `iss` claim + issuer: "" + # Claim path which contains roles (e.g. "roles", "realm_access.roles", etc.) + roles_claim: "scope" + # Map idp role to local roles. Example maps idp->local-role(s): + # role_mapping: + # engineer: + # - "developer" + # - "consumer" + # admin: + # - "developer" + # - "admin" + # - "consumer" + role_mapping: + + # Logging configuration + logging: + # Log level: "debug", "info", "warn", or "error" + level: debug + + # Log format: "json" or "console" + # - json: Structured JSON format (recommended for production) + # - console: Human-readable console format (recommended for development) + format: json + +policy_engine: + server: + # Port for ext_proc gRPC server (receives requests from Envoy) + extproc_port: 9001 + + # Admin HTTP server configuration + admin: + # Enable admin HTTP server for debugging endpoints + enabled: true + + # Port for admin HTTP server + port: 9002 + + # IP addresses allowed to access the admin API + # Defaults to localhost only for security + allowed_ips: + - "*" + - "127.0.0.1" + + config_mode: + # Configuration mode: "file" or "xds" + # - file: Load policy chains from static YAML file + # - xds: Subscribe to xDS server for dynamic configuration updates + mode: xds + + # xDS client configuration + xds: + # Enable xDS client + enabled: true + + # xDS server address (typically the gateway-controller xDS server) + server_address: gateway-controller:18001 + + # Node identifier for this policy engine instance + # Used by xDS server to identify this client + node_id: policy-engine-1 + + # Cluster identifier for this policy engine instance + # Used for grouping multiple policy engine instances + cluster: policy-engine-cluster + + # Connection timeout for establishing initial connection + connect_timeout: 10s + + # Request timeout for individual xDS requests + request_timeout: 5s + + # Initial delay before reconnecting after connection failure + initial_reconnect_delay: 1s + + 
# Maximum delay between reconnection attempts (exponential backoff) + max_reconnect_delay: 60s + + # TLS configuration for xDS connection + tls: + enabled: false + # cert_path: /path/to/client-cert.pem + # key_path: /path/to/client-key.pem + # ca_path: /path/to/ca-cert.pem + + # File-based configuration (not used in xDS mode) + file_config: + path: "" + + # Logging configuration + logging: + # Log level: debug, info, warn, error + level: debug + + # Log format: json, text + format: json + +analytics: + enabled: true + # Publishers specific configuration + publishers: + - type: moesif + enabled: true + settings: + # Moesif application ID + application_id: + # Publish interval in seconds + publish_interval: 5 + # Max number of events held in memory before dropping + event_queue_size: 10000 + # Max number of events in a batch + batch_size: 50 + # Time interval in seconds to wake up the timer + timer_wakeup_seconds: 3 + + # gRPC Access Log Service configuration for router(Envoy) + grpc_access_logs: + # ALS Server hostname + host: policy-engine + + # Log name identifier + log_name: "envoy_access_log" + + # Buffer settings + buffer_flush_interval: 1000000000 # nanoseconds + buffer_size_bytes: 16384 + grpc_request_timeout: 20000000000 # nanoseconds + + # ALS server related configuration + access_logs_service: + # ALS Server gRPC port + als_server_port: 18090 + + # shutdown timeout for the ALS server + shutdown_timeout: 600 #seconds + + # public and private key path for the ALS server + public_key_path: "" + private_key_path: "" + + # whether to use plain text for the ALS server + als_plain_text: true + + # max message size for the ALS server + max_message_size: 1000000000 + + # max header limit for the ALS server + max_header_limit: 8192 + +tracing: + enabled: false + endpoint: otel-collector:4317 + insecure: true + service_version: "1.0.0" + batch_timeout: 1s + max_export_batch_size: 512 + sampling_rate: 1.0 + +policy_configurations: + JWTAuth_v010: + KeyManagers: + - name: WSO2KeyManager1 + issuer: https://api.asgardeo.io/t/tharsanan1995/oauth2/token + jwks: + remote: + uri: https://api.asgardeo.io/t/tharsanan1995/oauth2/jwks + skipTlsVerify: false + - name: WSO2KeyManager2 + issuer: https://api.asgardeo.io/t/testorgforsecurity/oauth2/token + jwks: + remote: + uri: https://api.asgardeo.io/t/testorgforsecurity/oauth2/jwks + skipTlsVerify: false + JwksCacheTtl: "5m" + JwksFetchTimeout: "5s" + JwksFetchRetryCount: 3 + JwksFetchRetryInterval: "2s" + AllowedAlgorithms: + - RS256 + - ES256 + Leeway: "30s" + AuthHeaderScheme: Bearer + HeaderName: Authorization + OnFailureStatusCode: 401 + ErrorMessageFormat: json + ErrorMessage: "Authentication failed." 
+ ValidateIssuer: true diff --git a/gateway/it/docker-compose.test.yaml b/gateway/it/docker-compose.test.yaml index 07b574b8..206de372 100644 --- a/gateway/it/docker-compose.test.yaml +++ b/gateway/it/docker-compose.test.yaml @@ -24,13 +24,14 @@ services: container_name: it-gateway-controller image: ghcr.io/wso2/api-platform/gateway-controller-coverage:0.1.0-SNAPSHOT ports: - - "9090:9090" # REST API + - "9099:9090" # REST API - "18000:18000" # xDS gRPC - "18001:18001" environment: - GATEWAY_STORAGE_TYPE=sqlite - GATEWAY_STORAGE_SQLITE_PATH=./data/gateway.db - GATEWAY_LOGGING_LEVEL=info + - GATEWAY_TRACING_ENABLED=${GATEWAY_TRACING_ENABLED:-false} - GOCOVERDIR=/coverage volumes: - controller-data-tests:/app/data @@ -53,6 +54,8 @@ services: command: ["-xds-server", "it-gateway-controller:18001"] ports: - "9002:9002" # Admin API + environment: + - GATEWAY_TRACING_ENABLED=${GATEWAY_TRACING_ENABLED:-false} volumes: - ../configs/config.yaml:/app/configs/config.yaml:ro depends_on: @@ -61,6 +64,19 @@ services: networks: - it-gateway-network + otel-collector: + image: otel/opentelemetry-collector-contrib:0.88.0 + container_name: it-otel-collector + command: ["--config=/etc/otel-collector-config.yaml"] + volumes: + - ./otel-config.yaml:/etc/otel-collector-config.yaml:ro + ports: + - "4317:4317" # OTLP gRPC receiver + - "4318:4318" # OTLP HTTP receiver + networks: + - it-gateway-network + + router: container_name: it-router image: ghcr.io/wso2/api-platform/gateway-router:0.1.0-SNAPSHOT @@ -87,7 +103,7 @@ services: container_name: it-sample-backend image: renukafernando/request-info:latest ports: - - "5000:5000" + - "5050:5000" command: ["-addr", ":5000", "-pretty"] networks: - it-gateway-network diff --git a/gateway/it/features/tracing.feature b/gateway/it/features/tracing.feature new file mode 100644 index 00000000..9d4ad104 --- /dev/null +++ b/gateway/it/features/tracing.feature @@ -0,0 +1,39 @@ +Feature: Distributed Tracing + As a developer + I want to ensure that API requests are traced + So that I can observe the system behavior + + @config-tracing + Scenario: API invocation generates a trace + Given the Gateway is running with tracing enabled + And I authenticate using basic auth as "admin" + When I deploy this API configuration: + """ + apiVersion: gateway.api-platform.wso2.com/v1alpha1 + kind: RestApi + metadata: + name: tracing-api-v1.0 + spec: + displayName: Tracing-API + version: v1.0 + context: /tracing/v1.0 + upstream: + main: + url: http://sample-backend:5000/api/v2 + operations: + - method: GET + path: /{country_code}/{city} + - method: GET + path: /alerts/active + - method: POST + path: /alerts/active + """ + Then the response should be successful + And the response should be valid JSON + And the JSON response field "status" should be "success" + And I wait for 2 seconds + When I send a GET request to "http://localhost:8080/tracing/v1.0/us/seattle" + Then the response should be successful + And the response should be valid JSON + And the response body should contain "/api/v2/us/seattle" + And I should see a trace for "Tracing-API" in the OpenTelemetry collector logs diff --git a/gateway/it/otel-config.yaml b/gateway/it/otel-config.yaml new file mode 100644 index 00000000..449bb377 --- /dev/null +++ b/gateway/it/otel-config.yaml @@ -0,0 +1,16 @@ +receivers: + otlp: + protocols: + grpc: + http: + +exporters: + logging: + loglevel: debug + +service: + pipelines: + traces: + receivers: [otlp] + processors: [] + exporters: [logging] diff --git a/gateway/it/setup.go b/gateway/it/setup.go 
index 5706225e..184f50c7 100644
--- a/gateway/it/setup.go
+++ b/gateway/it/setup.go
@@ -28,6 +28,7 @@ import (
 	"os/exec"
 	"os/signal"
 	"path/filepath"
+	"strings"
 	"sync"
 	"syscall"
 	"time"
@@ -46,7 +47,7 @@ const (
 	HealthCheckInterval = 2 * time.Second
 
 	// GatewayControllerPort is the REST API port for gateway-controller
-	GatewayControllerPort = "9090"
+	GatewayControllerPort = "9099"
 
 	// RouterPort is the HTTP traffic port for the router
 	RouterPort = "8080"
@@ -91,8 +92,14 @@ func NewComposeManager(composeFile string) (*ComposeManager, error) {
 
 	ctx, cancel := context.WithCancel(context.Background())
 
-	// Create compose stack using testcontainers-go
-	compose, err := tc.NewDockerCompose(absPath)
+	// Create compose stack using testcontainers-go with an explicit project name.
+	// This ensures that RestartGatewayController uses the same project name
+	// for docker compose commands, keeping all containers on the same network.
+	projectName := "gateway-it"
+	compose, err := tc.NewDockerComposeWith(
+		tc.StackIdentifier(projectName),
+		tc.WithStackFiles(absPath),
+	)
 	if err != nil {
 		cancel()
 		return nil, fmt.Errorf("failed to create docker compose: %w", err)
 	}
@@ -101,7 +108,7 @@ func NewComposeManager(composeFile string) (*ComposeManager, error) {
 	cm := &ComposeManager{
 		compose:     compose,
 		composeFile: absPath,
-		projectName: "gateway-it",
+		projectName: projectName,
 		ctx:         ctx,
 		cancel:      cancel,
 		signalChan:  make(chan os.Signal, 1),
@@ -139,6 +146,9 @@ func (cm *ComposeManager) Start() error {
 		return fmt.Errorf("failed to start docker compose: %w", err)
 	}
 
+	// Stream logs in background
+	cm.StreamLogs()
+
 	log.Println("Docker Compose services started, waiting for health checks...")
 
 	// Wait for services to be healthy (additional verification)
@@ -153,9 +163,174 @@ func (cm *ComposeManager) Start() error {
 	}
 
 	log.Println("All services are healthy and ready")
+	cm.PrintServiceStatus()
 	return nil
 }
 
+// PrintServiceStatus prints the status of docker compose services
+func (cm *ComposeManager) PrintServiceStatus() {
+	cmd := execCommandContext(cm.ctx, "docker", "compose", "-f", cm.composeFile, "-p", cm.projectName, "ps")
+	cmd.Stdout = os.Stdout
+	cmd.Stderr = os.Stderr
+	log.Println("--- Docker Compose Services ---")
+	if err := cmd.Run(); err != nil {
+		log.Printf("Failed to run docker compose ps: %v", err)
+	}
+	log.Println("-------------------------------")
+}
+
+// RestartGatewayController restarts the gateway-controller, policy-engine, and router services with specific environment variables
+func (cm *ComposeManager) RestartGatewayController(ctx context.Context, envVars map[string]string) error {
+	// The compose file pins explicit container_name values (it-gateway-controller,
+	// it-policy-engine, it-router), so the restart is driven through
+	// 'docker compose' commands scoped to cm.projectName rather than by
+	// stopping and removing containers under guessed default names
+	// (project-service-1 style).
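+	//
+	// For reference (illustrative only), switching to the tracing profile with
+	// envVars {"GATEWAY_TRACING_ENABLED": "true"} makes the controller restart
+	// below roughly equivalent to:
+	//
+	//   GATEWAY_TRACING_ENABLED=true docker compose \
+	//     -f docker-compose.test.yaml -p gateway-it up -d gateway-controller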
+ + log.Println("Restarting gateway services with new configuration...") + + // Stop and remove gateway-controller, policy-engine, and router + // We restart all three to ensure they reconnect to xDS with new config + stopCmd := execCommandContext(ctx, "docker", "compose", "-f", cm.composeFile, "-p", cm.projectName, "stop", "gateway-controller", "policy-engine", "router") + if err := stopCmd.Run(); err != nil { + return fmt.Errorf("failed to stop services: %w", err) + } + + // Force remove the containers by declared name to avoid conflicts + // We use direct docker rm because compose rm sometimes doesn't clear the name reservation fast enough + // or behaves differently with static container_names. + rmCmd := execCommandContext(ctx, "docker", "rm", "-f", "it-gateway-controller", "it-policy-engine", "it-router") + // We ignore error here because if it doesn't exist, that's fine. + _ = rmCmd.Run() + + // Build environment for docker compose commands + composeEnv := os.Environ() + for k, v := range envVars { + composeEnv = append(composeEnv, fmt.Sprintf("%s=%s", k, v)) + log.Printf("Setting env: %s=%s", k, v) + } + + // Start gateway-controller with new env vars + log.Println("Starting gateway-controller...") + args := []string{"compose", "-f", cm.composeFile, "-p", cm.projectName, "up", "-d", "gateway-controller"} + cmd := execCommandContext(ctx, "docker", args...) + cmd.Env = composeEnv + + output, err := cmd.CombinedOutput() + if err != nil { + return fmt.Errorf("failed to start gateway-controller: %w\nOutput: %s", err, string(output)) + } + + // Wait for gateway-controller health check + if err := cm.WaitForGatewayControllerHealthy(ctx); err != nil { + return err + } + + // Start policy-engine with same env vars (it depends on gateway-controller) + log.Println("Starting policy-engine...") + policyArgs := []string{"compose", "-f", cm.composeFile, "-p", cm.projectName, "up", "-d", "policy-engine"} + policyCmd := execCommandContext(ctx, "docker", policyArgs...) + policyCmd.Env = composeEnv + policyOutput, err := policyCmd.CombinedOutput() + if err != nil { + return fmt.Errorf("failed to start policy-engine: %w\nOutput: %s", err, string(policyOutput)) + } + + // Now start router (it depends on gateway-controller) + log.Println("Starting router...") + routerArgs := []string{"compose", "-f", cm.composeFile, "-p", cm.projectName, "up", "-d", "router"} + routerCmd := execCommandContext(ctx, "docker", routerArgs...) 
+ routerCmd.Env = composeEnv // Router also needs env vars for consistency + routerOutput, err := routerCmd.CombinedOutput() + if err != nil { + return fmt.Errorf("failed to start router: %w\nOutput: %s", err, string(routerOutput)) + } + + cm.PrintServiceStatus() + + // Wait for router to be ready and give time for xDS config propagation + if err := cm.WaitForRouterReady(ctx); err != nil { + return err + } + + // Additional wait for xDS configuration to propagate + log.Println("Waiting for xDS configuration to propagate...") + time.Sleep(3 * time.Second) + + log.Println("Gateway configuration change complete") + return nil +} + +// WaitForGatewayControllerHealthy waits for the gateway-controller to be healthy +func (cm *ComposeManager) WaitForGatewayControllerHealthy(ctx context.Context) error { + endpoint := fmt.Sprintf("http://localhost:%s/health", GatewayControllerPort) + client := &http.Client{ + Timeout: 2 * time.Second, + } + + ticker := time.NewTicker(500 * time.Millisecond) + defer ticker.Stop() + + timeoutCtx, cancel := context.WithTimeout(ctx, 30*time.Second) + defer cancel() + + for { + select { + case <-timeoutCtx.Done(): + return fmt.Errorf("timeout waiting for gateway-controller to be healthy") + case <-ticker.C: + resp, err := client.Get(endpoint) + if err != nil { + continue + } + resp.Body.Close() + + if resp.StatusCode == http.StatusOK { + return nil + } + } + } +} + +// WaitForRouterReady waits for the router (Envoy) to be ready +func (cm *ComposeManager) WaitForRouterReady(ctx context.Context) error { + endpoint := fmt.Sprintf("http://localhost:%s/ready", EnvoyAdminPort) + client := &http.Client{ + Timeout: 2 * time.Second, + } + + ticker := time.NewTicker(500 * time.Millisecond) + defer ticker.Stop() + + timeoutCtx, cancel := context.WithTimeout(ctx, 30*time.Second) + defer cancel() + + for { + select { + case <-timeoutCtx.Done(): + return fmt.Errorf("timeout waiting for router to be ready") + case <-ticker.C: + resp, err := client.Get(endpoint) + if err != nil { + continue + } + resp.Body.Close() + + if resp.StatusCode == http.StatusOK { + log.Println("Router is ready after gateway-controller restart") + return nil + } + } + } +} + // WaitForHealthy waits for all services to pass health checks func (cm *ComposeManager) WaitForHealthy(ctx context.Context) error { services := []struct { @@ -245,26 +420,74 @@ func (cm *ComposeManager) Cleanup() { cm.isShutdown = true log.Println("Cleaning up Docker Compose services...") - - // Cancel context to stop any ongoing operations cm.cancel() - // Stop signal handling - signal.Stop(cm.signalChan) - close(cm.signalChan) + cleanupCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() - // Run docker compose down with cleanup context - cleanupCtx, cleanupCancel := context.WithTimeout(context.Background(), 30*time.Second) - defer cleanupCancel() + // First try testcontainers-go compose down + if err := cm.compose.Down(cleanupCtx, tc.RemoveOrphans(true), tc.RemoveImagesLocal, tc.RemoveVolumes(true)); err != nil { + log.Printf("Testcontainers compose down warning: %v", err) + } - if err := cm.compose.Down(cleanupCtx, tc.RemoveOrphans(true), tc.RemoveVolumes(true)); err != nil { - log.Printf("Warning: error during cleanup: %v", err) + // Also run direct docker compose down to catch any containers started outside testcontainers tracking + // (e.g., containers restarted via RestartGatewayController) + log.Println("Running direct docker compose down for complete cleanup...") + cmd := 
execCommandContext(cleanupCtx, "docker", "compose", "-f", cm.composeFile, "-p", cm.projectName, "down", "-v", "--remove-orphans") + if output, err := cmd.CombinedOutput(); err != nil { + log.Printf("Direct docker compose down warning: %v, output: %s", err, string(output)) } - log.Println("Cleanup complete") + // Explicitly remove the test volume in case it wasn't removed by compose down + volumeName := cm.projectName + "_controller-data-tests" + log.Printf("Removing volume %s...", volumeName) + volCmd := execCommandContext(cleanupCtx, "docker", "volume", "rm", "-f", volumeName) + if output, err := volCmd.CombinedOutput(); err != nil { + log.Printf("Volume removal warning (may not exist): %v, output: %s", err, string(output)) + } }) } +// StreamLogs streams service logs to stdout +func (cm *ComposeManager) StreamLogs() { + go func() { + log.Println("Streaming logs from containers...") + cmd := execCommandContext(cm.ctx, "docker", "compose", "-f", cm.composeFile, "-p", cm.projectName, "logs", "-f") + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + if err := cmd.Run(); err != nil { + // Don't log error on context cancellation (standard shutdown) + if cm.ctx.Err() == nil { + log.Printf("Background log streaming stopped: %v", err) + } + } + }() +} + +// CheckLogsForText checks if a container's logs contain specific text +func (cm *ComposeManager) CheckLogsForText(ctx context.Context, containerName, text string) (bool, error) { + // Need to use the actual container name (project name + service name usually, or explicit name) + // In our compose file, we set container_name explicitly (e.g., it-otel-collector) + + cmd := execCommandContext(ctx, "docker", "logs", containerName) + output, err := cmd.CombinedOutput() + if err != nil { + return false, fmt.Errorf("failed to get logs for %s: %w", containerName, err) + } + + return strings.Contains(string(output), text), nil +} + +// GetContainerLogs returns the logs of a container +func (cm *ComposeManager) GetContainerLogs(ctx context.Context, containerName string) (string, error) { + cmd := execCommandContext(ctx, "docker", "logs", containerName) + output, err := cmd.CombinedOutput() + if err != nil { + return "", fmt.Errorf("failed to get logs for %s: %w", containerName, err) + } + return string(output), nil +} + // CheckDockerAvailable verifies that Docker is running and accessible func CheckDockerAvailable() error { // Check Docker by running a simple command @@ -287,7 +510,7 @@ func CheckPortsAvailable() error { "8443", // HTTPS EnvoyAdminPort, // 9901 "9002", // Policy engine - "5000", // Sample backend + "5050", // Sample backend "18000", // xDS gRPC "18001", // xDS gRPC } diff --git a/gateway/it/state.go b/gateway/it/state.go index 54fda469..0218a735 100644 --- a/gateway/it/state.go +++ b/gateway/it/state.go @@ -47,7 +47,7 @@ func DefaultConfig() *Config { GatewayControllerURL: fmt.Sprintf("http://localhost:%s", GatewayControllerPort), RouterURL: fmt.Sprintf("http://localhost:%s", RouterPort), PolicyEngineURL: "http://localhost:9002", - SampleBackendURL: "http://localhost:5000", + SampleBackendURL: "http://localhost:5050", HTTPTimeout: 10 * time.Second, Users: map[string]AuthUser{ "admin": {Username: "admin", Password: "admin"}, diff --git a/gateway/it/suite_test.go b/gateway/it/suite_test.go index f22717f7..7e3bcd75 100644 --- a/gateway/it/suite_test.go +++ b/gateway/it/suite_test.go @@ -20,6 +20,7 @@ package it import ( "context" + "fmt" "log" "os" "path/filepath" @@ -47,8 +48,20 @@ var ( // testReporter manages test report generation 
 	testReporter *TestReporter
+
+	// gatewayConfigManager manages gateway configuration profiles
+	gatewayConfigManager *GatewayConfigManager
 )
 
+var opts = godog.Options{
+	Format: "pretty",
+	Paths:  []string{"features"},
+}
+
+func init() {
+	godog.BindCommandLineFlags("godog.", &opts)
+}
+
 // TestFeatures is the main entry point for BDD tests
 func TestFeatures(t *testing.T) {
 	// Initialize test reporter
@@ -57,14 +70,20 @@ func TestFeatures(t *testing.T) {
 		log.Printf("Warning: Failed to setup test reporter: %v", err)
 	}
 
+	// Register cleanup to run on any test exit (including panics and failures)
+	t.Cleanup(func() {
+		if composeManager != nil {
+			log.Println("Running t.Cleanup: ensuring containers and volumes are removed...")
+			composeManager.Cleanup()
+		}
+	})
+
+	opts.TestingT = t
+
 	suite := godog.TestSuite{
 		TestSuiteInitializer: InitializeTestSuite,
 		ScenarioInitializer:  InitializeScenario,
-		Options: &godog.Options{
-			Format:   "pretty",
-			Paths:    []string{"features"},
-			TestingT: t,
-		},
+		Options:              &opts,
 	}
 
 	exitCode := suite.Run()
@@ -78,6 +97,9 @@ func InitializeTestSuite(ctx *godog.TestSuiteContext) {
 	ctx.BeforeSuite(func() {
 		log.Println("=== Integration Test Suite Starting ===")
 
+		// Disable Ryuk reaper to avoid socket mount issues with Colima
+		os.Setenv("TESTCONTAINERS_RYUK_DISABLED", "true")
+
 		// Initialize coverage collector (always enabled)
 		coverageCollector = NewCoverageCollector(DefaultCoverageConfig())
 		if err := coverageCollector.Setup(); err != nil {
@@ -106,6 +128,9 @@ func InitializeTestSuite(ctx *godog.TestSuiteContext) {
 			log.Fatalf("Failed to start services: %v", err)
 		}
 
+		// Initialize config manager
+		gatewayConfigManager = NewGatewayConfigManager(composeManager)
+
 		// Initialize global test state
 		testState = NewTestState()
 
@@ -154,6 +179,14 @@ func InitializeScenario(ctx *godog.ScenarioContext) {
 	// Reset state before each scenario
 	ctx.Before(func(ctx context.Context, sc *godog.Scenario) (context.Context, error) {
 		log.Printf("Starting scenario: %s", sc.Name)
+
+		// Ensure correct configuration is running
+		if gatewayConfigManager != nil {
+			if err := gatewayConfigManager.EnsureConfig(ctx, sc); err != nil {
+				return ctx, fmt.Errorf("failed to configure gateway: %w", err)
+			}
+		}
+
 		if testState != nil {
 			testState.Reset()
 		}
@@ -187,6 +220,11 @@ func InitializeScenario(ctx *godog.ScenarioContext) {
 		RegisterAPISteps(ctx, testState, httpSteps)
 	}
 
+	// Register tracing steps
+	if composeManager != nil {
+		RegisterTracingSteps(ctx, composeManager)
+	}
+
 	// Register common HTTP and assertion steps
 	if httpSteps != nil {
 		httpSteps.Register(ctx)
diff --git a/gateway/it/tracing_steps.go b/gateway/it/tracing_steps.go
new file mode 100644
index 00000000..bcee0518
--- /dev/null
+++ b/gateway/it/tracing_steps.go
@@ -0,0 +1,55 @@
+package it
+
+import (
+	"context"
+	"fmt"
+	"log"
+	"time"
+
+	"github.com/cucumber/godog"
+)
+
+// RegisterTracingSteps registers the tracing-related steps
+func RegisterTracingSteps(ctx *godog.ScenarioContext, cm *ComposeManager) {
+	ctx.Step(`^I should see a trace for "([^"]*)" in the OpenTelemetry collector logs$`, func(serviceName string) error {
+		return verifyTraceInLogs(cm, serviceName)
+	})
+
+	ctx.Step(`^the Gateway is running with tracing enabled$`, func() error {
+		// This step exists for scenario readability; the @config-tracing tag
+		// drives the actual setup via GatewayConfigManager.EnsureConfig.
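+		// A stricter variant (sketch) could expose a mutex-guarded accessor on
+		// GatewayConfigManager and fail the step unless the active profile is "tracing".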
+		return nil
+	})
+}
+
+func verifyTraceInLogs(cm *ComposeManager, text string) error {
+	// Retry for a few seconds, as collector logs can lag behind the request
+	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
+	defer cancel()
+
+	ticker := time.NewTicker(1 * time.Second)
+	defer ticker.Stop()
+
+	for {
+		select {
+		case <-ctx.Done():
+			// Dump the collector logs before failing to aid debugging
+			log.Printf("Timeout waiting for trace logs. Dumping logs for %s:", "it-otel-collector")
+			output, _ := cm.GetContainerLogs(context.Background(), "it-otel-collector")
+			log.Println(output)
+			return fmt.Errorf("timed out waiting for trace logs containing '%s'", text)
+		case <-ticker.C:
+			found, err := cm.CheckLogsForText(ctx, "it-otel-collector", text)
+			if err != nil {
+				// Don't fail immediately on a log retrieval error; the container might still be starting
+				continue
+			}
+			if found {
+				log.Printf("Found trace log containing '%s'", text)
+				return nil
+			}
+		}
+	}
+}