diff --git a/confgenerator/confgenerator.go b/confgenerator/confgenerator.go index 33e659753d..c43a31b0ef 100644 --- a/confgenerator/confgenerator.go +++ b/confgenerator/confgenerator.go @@ -224,8 +224,8 @@ func (uc *UnifiedConfig) GenerateOtelConfig(ctx context.Context, outDir, stateDi agentSelfMetrics := AgentSelfMetrics{ MetricsVersionLabel: metricVersionLabel, LoggingVersionLabel: loggingVersionLabel, - FluentBitPort: fluentbit.MetricsPort, - OtelPort: otel.MetricsPort, + FluentBitPort: int(uc.GetFluentBitMetricsPort()), + OtelPort: int(uc.GetOtelMetricsPort()), OtelRuntimeDir: outDir, } agentSelfMetrics.AddSelfMetricsPipelines(receiverPipelines, pipelines, ctx) @@ -238,6 +238,7 @@ func (uc *UnifiedConfig) GenerateOtelConfig(ctx context.Context, outDir, stateDi LogLevel: uc.getOTelLogLevel(), ReceiverPipelines: receiverPipelines, Pipelines: pipelines, + MetricsPort: uc.GetOtelMetricsPort(), Exporters: map[otel.ExporterType]otel.ExporterComponents{ otel.System: { Exporter: googleCloudExporter(userAgent, false, false), @@ -608,7 +609,7 @@ func (uc *UnifiedConfig) generateFluentbitComponents(ctx context.Context, userAg out = append(out, addGceMetadataAttributesProcessor(ctx).Components(ctx, "*", "*.default-data-proc.gce_metadata")...) } out = append(out, uc.generateSelfLogsComponents(ctx, userAgent)...) - out = append(out, fluentbit.MetricsOutputComponent()) + out = append(out, fluentbit.MetricsOutputComponent(int(uc.GetFluentBitMetricsPort()))) return out, nil } diff --git a/confgenerator/config.go b/confgenerator/config.go index 0bf6038dc9..942e31ee01 100644 --- a/confgenerator/config.go +++ b/confgenerator/config.go @@ -18,11 +18,13 @@ import ( "context" "fmt" "log" + "os" "path/filepath" "reflect" "runtime" "slices" "sort" + "strconv" "strings" "time" @@ -66,6 +68,24 @@ func (uc *UnifiedConfig) HasCombined() bool { return uc.Combined != nil } +func (uc *UnifiedConfig) GetFluentBitMetricsPort() uint16 { + if portStr := os.Getenv("OPS_AGENT_FLUENT_BIT_METRICS_PORT"); portStr != "" { + if port, err := strconv.ParseUint(portStr, 10, 16); err == nil { + return uint16(port) + } + } + return fluentbit.MetricsPort +} + +func (uc *UnifiedConfig) GetOtelMetricsPort() uint16 { + if portStr := os.Getenv("OPS_AGENT_OTEL_METRICS_PORT"); portStr != "" { + if port, err := strconv.ParseUint(portStr, 10, 16); err == nil { + return uint16(port) + } + } + return otel.MetricsPort +} + func (uc *UnifiedConfig) DeepCopy(ctx context.Context) (*UnifiedConfig, error) { toYaml, err := yaml.Marshal(uc) if err != nil { diff --git a/confgenerator/fluentbit/metrics.go b/confgenerator/fluentbit/metrics.go index 0bf2ab86de..27f18f0cb0 100644 --- a/confgenerator/fluentbit/metrics.go +++ b/confgenerator/fluentbit/metrics.go @@ -29,7 +29,7 @@ func MetricsInputComponent() Component { } } -func MetricsOutputComponent() Component { +func MetricsOutputComponent(port int) Component { return Component{ Kind: "OUTPUT", Config: map[string]string{ @@ -37,7 +37,7 @@ func MetricsOutputComponent() Component { "Name": "prometheus_exporter", "Match": "*", "host": "0.0.0.0", - "port": strconv.Itoa(MetricsPort), + "port": strconv.Itoa(port), }, } } diff --git a/confgenerator/otel/modular.go b/confgenerator/otel/modular.go index 31fa3e1858..0ce0f95505 100644 --- a/confgenerator/otel/modular.go +++ b/confgenerator/otel/modular.go @@ -133,6 +133,7 @@ type ModularConfig struct { Pipelines map[string]Pipeline Exporters map[ExporterType]ExporterComponents Extensions map[string]Component + MetricsPort uint16 // Test-only options: // Don't generate any self-metrics @@ -171,7 +172,7 @@ func (c ModularConfig) Generate(ctx context.Context) (string, error) { "exporter": map[string]interface{}{ "prometheus": map[string]interface{}{ "host": "0.0.0.0", - "port": MetricsPort, + "port": c.MetricsPort, // See https://docs.datadoghq.com/opentelemetry/migrate/collector_0_120_0/#changes-to-prometheus-server-reader-defaults for why these fields are needed. // See https://github.com/open-telemetry/opentelemetry-collector/pull/11611/files#diff-150d72bc611b4b0de17f646768979b15936f820a029cafa91c4037d50ae47e5a for the actual upstream otel code changes. diff --git a/confgenerator/port_env_test.go b/confgenerator/port_env_test.go new file mode 100644 index 0000000000..fe6e41748b --- /dev/null +++ b/confgenerator/port_env_test.go @@ -0,0 +1,61 @@ +package confgenerator + +import ( + "os" + "testing" + + "github.com/GoogleCloudPlatform/ops-agent/confgenerator/fluentbit" + "github.com/GoogleCloudPlatform/ops-agent/confgenerator/otel" +) + +func TestPortOverriddenByEnv(t *testing.T) { + // Set env vars + os.Setenv("OPS_AGENT_FLUENT_BIT_METRICS_PORT", "40002") + os.Setenv("OPS_AGENT_OTEL_METRICS_PORT", "40001") + defer os.Unsetenv("OPS_AGENT_FLUENT_BIT_METRICS_PORT") + defer os.Unsetenv("OPS_AGENT_OTEL_METRICS_PORT") + + uc := &UnifiedConfig{} + + if port := uc.GetFluentBitMetricsPort(); port != 40002 { + t.Errorf("Expected Fluent Bit port 40002, got %d", port) + } + + if port := uc.GetOtelMetricsPort(); port != 40001 { + t.Errorf("Expected OTel port 40001, got %d", port) + } +} + +func TestPortDefaultWhenEnvEmpty(t *testing.T) { + // Ensure env vars are not set + os.Unsetenv("OPS_AGENT_FLUENT_BIT_METRICS_PORT") + os.Unsetenv("OPS_AGENT_OTEL_METRICS_PORT") + + uc := &UnifiedConfig{} + + if port := uc.GetFluentBitMetricsPort(); port != fluentbit.MetricsPort { + t.Errorf("Expected Fluent Bit default port %d, got %d", fluentbit.MetricsPort, port) + } + + if port := uc.GetOtelMetricsPort(); port != otel.MetricsPort { + t.Errorf("Expected OTel default port %d, got %d", otel.MetricsPort, port) + } +} + +func TestPortInvalidEnvFallbacksToDefault(t *testing.T) { + // Set invalid env vars + os.Setenv("OPS_AGENT_FLUENT_BIT_METRICS_PORT", "invalid") + os.Setenv("OPS_AGENT_OTEL_METRICS_PORT", "65536") // Out of range for uint16 + defer os.Unsetenv("OPS_AGENT_FLUENT_BIT_METRICS_PORT") + defer os.Unsetenv("OPS_AGENT_OTEL_METRICS_PORT") + + uc := &UnifiedConfig{} + + if port := uc.GetFluentBitMetricsPort(); port != fluentbit.MetricsPort { + t.Errorf("Expected Fluent Bit default port %d for invalid env, got %d", fluentbit.MetricsPort, port) + } + + if port := uc.GetOtelMetricsPort(); port != otel.MetricsPort { + t.Errorf("Expected OTel default port %d for invalid env, got %d", otel.MetricsPort, port) + } +} diff --git a/integration_test/ops_agent_test/main_test.go b/integration_test/ops_agent_test/main_test.go index 43bd7f670f..654a9718ad 100644 --- a/integration_test/ops_agent_test/main_test.go +++ b/integration_test/ops_agent_test/main_test.go @@ -6131,6 +6131,87 @@ func TestUninstallRemovesService(t *testing.T) { }) } +func TestMetricsPortOverrideEnv(t *testing.T) { + t.Parallel() + RunForEachImageAndFeatureFlag(t, []string{agents.OtlpHttpExporterFeatureFlag}, func(t *testing.T, imageSpec string, feature string) { + t.Parallel() + if gce.IsWindows(imageSpec) { + t.Skip("Skipping on Windows for now as it requires different environment variable setup") + } + if gce.IsOpsAgentUAPPlugin() { + t.Skip("Skipping on UAP plugin as it is not supported") + } + ctx, logger, vm := setupMainLogAndVM(t, imageSpec) + + // Setup agent with default config first + if err := agents.SetupOpsAgentWithFeatureFlag(ctx, logger, vm, "", feature); err != nil { + t.Fatal(err) + } + + // Stop the agent to avoid race conditions while setting up overrides + if _, err := gce.RunRemotely(ctx, logger, vm, "sudo systemctl stop google-cloud-ops-agent"); err != nil { + t.Fatal(err) + } + + // Set up systemd overrides for Fluent Bit + fbOverrideDir := "/etc/systemd/system/google-cloud-ops-agent-fluent-bit.service.d" + fbOverrideFile := fbOverrideDir + "/override.conf" + if _, err := gce.RunRemotely(ctx, logger, vm, fmt.Sprintf("sudo mkdir -p %s", fbOverrideDir)); err != nil { + t.Fatal(err) + } + fbOverrideContent := `[Service] +Environment="OPS_AGENT_FLUENT_BIT_METRICS_PORT=40002" +` + if _, err := gce.RunRemotely(ctx, logger, vm, fmt.Sprintf("echo '%s' | sudo tee %s", fbOverrideContent, fbOverrideFile)); err != nil { + t.Fatal(err) + } + + // Set up systemd overrides for OTel Collector + otelOverrideDir := "/etc/systemd/system/google-cloud-ops-agent-opentelemetry-collector.service.d" + otelOverrideFile := otelOverrideDir + "/override.conf" + if _, err := gce.RunRemotely(ctx, logger, vm, fmt.Sprintf("sudo mkdir -p %s", otelOverrideDir)); err != nil { + t.Fatal(err) + } + otelOverrideContent := `[Service] +Environment="OPS_AGENT_OTEL_METRICS_PORT=40001" +Environment="OPS_AGENT_FLUENT_BIT_METRICS_PORT=40002" +` + if _, err := gce.RunRemotely(ctx, logger, vm, fmt.Sprintf("echo '%s' | sudo tee %s", otelOverrideContent, otelOverrideFile)); err != nil { + t.Fatal(err) + } + + // Reload systemd and restart agent + if _, err := gce.RunRemotely(ctx, logger, vm, "sudo systemctl daemon-reload"); err != nil { + t.Fatal(err) + } + if _, err := gce.RunRemotely(ctx, logger, vm, "sudo systemctl start google-cloud-ops-agent"); err != nil { + t.Fatal(err) + } + + // Wait for agent to start up + time.Sleep(20 * time.Second) + + // Verify that we can scrape metrics from the new ports + // Fluent Bit metrics on 40002 + fbMetricsOut, err := gce.RunRemotely(ctx, logger, vm, "curl -s localhost:40002/metrics") + if err != nil { + t.Fatalf("Failed to scrape Fluent Bit metrics on port 40002: %v", err) + } + if !strings.Contains(fbMetricsOut.Stdout, "fluentbit_uptime") { + t.Fatalf("Fluent Bit metrics on port 40002 do not contain expected content. Output: %s", fbMetricsOut.Stdout) + } + + // OTel Collector metrics on 40001 + otelMetricsOut, err := gce.RunRemotely(ctx, logger, vm, "curl -s localhost:40001/metrics") + if err != nil { + t.Fatalf("Failed to scrape OTel Collector metrics on port 40001: %v", err) + } + if !strings.Contains(otelMetricsOut.Stdout, "otelcol_") { + t.Fatalf("OTel Collector metrics on port 40001 do not contain expected content. Output: %s", otelMetricsOut.Stdout) + } + }) +} + func TestMain(m *testing.M) { code := m.Run() gce.CleanupKeysOrDie()