Skip to content

Commit b355d8e

Browse files
authored
[processor/redactionprocessor] Add URL sanitization feature to redactionprocessor (#41774)
<!--Ex. Fixing a bug - Describe the bug and how this fixes the issue. Ex. Adding a feature - Explain what this achieves.-->
#### Description

Add URL sanitization feature to redactionprocessor

<!-- Issue number (e.g. #1234) or full URL to issue, if applicable. -->
#### Link to tracking issue

Fixes #41535

<!--Describe what testing was performed and which tests were added.-->
#### Testing

- Added new tests
- Tested with this config:

```yaml
receivers:
  otlp:
    protocols:
      grpc:
      http:
exporters:
  debug:
    verbosity: detailed
processors:
  redaction:
    allow_all_keys: true
    url_sanitizer:
      enabled: true
      attributes:
        - "url"
service:
  pipelines:
    logs:
      receivers: [otlp]
      processors: [redaction]
      exporters: [debug]
    traces:
      receivers: [otlp]
      processors: [redaction]
      exporters: [debug]
    metrics:
      receivers: [otlp]
      processors: [redaction]
      exporters: [debug]
```

Signed-off-by: Israel Blancas <[email protected]>
1 parent 61aeb11 commit b355d8e

File tree

9 files changed

+531
-0
lines changed

9 files changed

+531
-0
lines changed

.chloggen/41535.yaml

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
# Use this changelog template to create an entry for release notes.
2+
3+
# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix'
4+
change_type: enhancement
5+
6+
# The name of the component, or a single word describing the area of concern, (e.g. filelogreceiver)
7+
component: redactionprocessor
8+
9+
# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`).
10+
note: "Add support for URL sanitization in the redaction processor."
11+
12+
# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists.
13+
issues: [41535]
14+
15+
# (Optional) One or more lines of additional information to render under the primary note.
16+
# These lines will be padded with 2 spaces and then inserted directly into the document.
17+
# Use pipe (|) for multiline entries.
18+
subtext:
19+
20+
# If your change doesn't affect end users or the exported elements of any package,
21+
# you should instead start your pull request title with [chore] or use the "Skip Changelog" label.
22+
# Optional: The change log or logs in which this entry should be included.
23+
# e.g. '[user]' or '[user, api]'
24+
# Include 'user' if the change is relevant to end users.
25+
# Include 'api' if there is a change to a library API.
26+
# Default: '[user]'
27+
change_logs: []

processor/redactionprocessor/README.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,12 @@ processors:
102102
# - `info` includes just the redacted key counts in the summary
103103
# - `silent` omits the summary attributes
104104
summary: debug
105+
# url_sanitizer configures URL sanitization, which removes variable elements from URLs that would otherwise cause high-cardinality issues
106+
url_sanitizer:
107+
# enabled controls whether URL sanitization is active
108+
enabled: true
109+
# attributes is a list of attribute keys that contain URLs to be sanitized
110+
attributes: ["http.url", "url"]
105111
```
106112
107113
Refer to [config.yaml](./testdata/config.yaml) for how to fit the configuration
@@ -130,6 +136,10 @@ instead of masking them with a fixed string. By default, no hash function is use
130136
and masking with a fixed string is performed. The supported hash functions
131137
are `md5`, `sha1` and `sha3` (SHA-256).
132138

139+
The `url_sanitizer` configuration enables sanitization of URLs in specified attributes by removing potentially sensitive information like UUIDs, timestamps, and other non-essential path segments. This is particularly useful for reducing cardinality in telemetry data while preserving the essential parts of URLs for troubleshooting.
140+
141+
Additionally, when URL sanitization is enabled, it is automatically applied to the names of spans whose kind is client or server and whose name contains a "/" character. This helps reduce cardinality issues caused by high-variability URL paths in span names while preserving the essential routing information needed for observability.
142+
133143
For example, if `notes` is on the list of allowed keys, then the `notes`
134144
attribute is retained. However, if there is a value such as a credit card
135145
number in the `notes` field that matched a regular expression on the list of

processor/redactionprocessor/config.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import (
1010
"strings"
1111

1212
"github.com/open-telemetry/opentelemetry-collector-contrib/processor/redactionprocessor/internal/db"
13+
"github.com/open-telemetry/opentelemetry-collector-contrib/processor/redactionprocessor/internal/url"
1314
)
1415

1516
var _ encoding.TextUnmarshaler = (*HashFunction)(nil)
@@ -69,6 +70,9 @@ type Config struct {
6970
// information, while it is valuable when integrating and testing a new
7071
// configuration. Possible values are `debug`, `info`, and `silent`.
7172
Summary string `mapstructure:"summary"`
73+
74+
// URLSanitization configures URL sanitization, which removes UUIDs, timestamps, and other non-essential information from URLs
75+
URLSanitization url.URLSanitizationConfig `mapstructure:"url_sanitizer"`
7276
}
7377

7478
func (u HashFunction) String() string {

processor/redactionprocessor/go.mod

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ go 1.24.0
44

55
require (
66
github.com/DataDog/datadog-agent/pkg/obfuscate v0.70.2
7+
github.com/grafana/clusterurl v0.2.1
78
github.com/stretchr/testify v1.11.1
89
go.opentelemetry.io/collector/component v1.42.0
910
go.opentelemetry.io/collector/component/componenttest v0.136.0
@@ -38,6 +39,7 @@ require (
3839
github.com/gogo/protobuf v1.3.2 // indirect
3940
github.com/google/uuid v1.6.0 // indirect
4041
github.com/hashicorp/go-version v1.7.0 // indirect
42+
github.com/hashicorp/golang-lru/v2 v2.0.7 // indirect
4143
github.com/json-iterator/go v1.1.12 // indirect
4244
github.com/knadh/koanf/maps v0.1.2 // indirect
4345
github.com/knadh/koanf/providers/confmap v1.0.0 // indirect

processor/redactionprocessor/go.sum

Lines changed: 4 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
// Copyright The OpenTelemetry Authors
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
package url // import "github.com/open-telemetry/opentelemetry-collector-contrib/processor/redactionprocessor/internal/url"
5+
6+
// URLSanitizationConfig holds the user-facing settings of the URL sanitizer.
type URLSanitizationConfig struct {
	// Enabled switches URL sanitization on or off.
	Enabled bool `mapstructure:"enabled"`
	// Attributes lists the attribute keys whose values are sanitized as URLs.
	Attributes []string `mapstructure:"attributes"`
}
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
// Copyright The OpenTelemetry Authors
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
package url // import "github.com/open-telemetry/opentelemetry-collector-contrib/processor/redactionprocessor/internal/url"
5+
6+
import (
7+
"fmt"
8+
9+
"github.com/grafana/clusterurl/pkg/clusterurl"
10+
)
11+
12+
type URLSanitizer struct {
13+
classifier *clusterurl.ClusterURLClassifier
14+
attributes map[string]bool
15+
}
16+
17+
func NewURLSanitizer(config URLSanitizationConfig) (*URLSanitizer, error) {
18+
classifier, err := clusterurl.NewClusterURLClassifier(nil)
19+
if err != nil {
20+
return nil, fmt.Errorf("unable to create cluster URL classifier: %w", err)
21+
}
22+
23+
attributes := make(map[string]bool)
24+
for _, attribute := range config.Attributes {
25+
attributes[attribute] = true
26+
}
27+
28+
return &URLSanitizer{
29+
classifier: classifier,
30+
attributes: attributes,
31+
}, nil
32+
}
33+
34+
func (s *URLSanitizer) SanitizeAttributeURL(url, attributeKey string) string {
35+
if url == "" {
36+
return url
37+
}
38+
39+
if _, ok := s.attributes[attributeKey]; ok {
40+
return s.SanitizeURL(url)
41+
}
42+
43+
return url
44+
}
45+
46+
// SanitizeURL sanitizes the given URL by removing any gibberish words.
47+
// https://github.com/open-telemetry/opentelemetry-ebpf-instrumentation/blob/38ca7938595409b8ffe6b897c14a0e3280dd2941/pkg/components/transform/route/cluster.go#L48
48+
func (s *URLSanitizer) SanitizeURL(url string) string {
49+
return s.classifier.ClusterURL(url)
50+
}

processor/redactionprocessor/processor.go

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import (
2323
"golang.org/x/crypto/sha3"
2424

2525
"github.com/open-telemetry/opentelemetry-collector-contrib/processor/redactionprocessor/internal/db"
26+
"github.com/open-telemetry/opentelemetry-collector-contrib/processor/redactionprocessor/internal/url"
2627
)
2728

2829
const attrValuesSeparator = ","
@@ -44,6 +45,8 @@ type redaction struct {
4445
config *Config
4546
// Logger
4647
logger *zap.Logger
48+
// URL sanitizer
49+
urlSanitizer *url.URLSanitizer
4750
// Database obfuscator
4851
dbObfuscator *db.Obfuscator
4952
}
@@ -69,6 +72,13 @@ func newRedaction(ctx context.Context, config *Config, logger *zap.Logger) (*red
6972
return nil, fmt.Errorf("failed to process allow list: %w", err)
7073
}
7174

75+
var urlSanitizer *url.URLSanitizer
76+
if config.URLSanitization.Enabled {
77+
urlSanitizer, err = url.NewURLSanitizer(config.URLSanitization)
78+
if err != nil {
79+
return nil, fmt.Errorf("failed to create URL sanitizer: %w", err)
80+
}
81+
}
7282
dbObfuscator := db.NewObfuscator(config.DBSanitizer)
7383

7484
return &redaction{
@@ -80,6 +90,7 @@ func newRedaction(ctx context.Context, config *Config, logger *zap.Logger) (*red
8090
hashFunction: config.HashFunction,
8191
config: config,
8292
logger: logger,
93+
urlSanitizer: urlSanitizer,
8394
dbObfuscator: dbObfuscator,
8495
}, nil
8596
}
@@ -131,6 +142,10 @@ func (s *redaction) processResourceSpan(ctx context.Context, rs ptrace.ResourceS
131142

132143
// Attributes can also be part of span events
133144
s.processSpanEvents(ctx, span.Events())
145+
146+
if s.shouldRedactSpanName(&span) {
147+
span.SetName(s.urlSanitizer.SanitizeURL(span.Name()))
148+
}
134149
}
135150
}
136151
}
@@ -407,6 +422,10 @@ func (s *redaction) processStringValueForAttribute(strVal, attributeKey string)
407422
}
408423
}
409424

425+
if s.urlSanitizer != nil {
426+
strVal = s.urlSanitizer.SanitizeAttributeURL(strVal, attributeKey)
427+
}
428+
410429
if s.dbObfuscator != nil {
411430
obfuscatedQuery, err := s.dbObfuscator.ObfuscateAttribute(strVal, attributeKey)
412431
if err != nil {
@@ -427,6 +446,10 @@ func (s *redaction) processStringValueForLogBody(strVal string) string {
427446
}
428447
}
429448

449+
if s.urlSanitizer != nil {
450+
strVal = s.urlSanitizer.SanitizeURL(strVal)
451+
}
452+
430453
if s.dbObfuscator != nil {
431454
obfuscatedQuery, err := s.dbObfuscator.Obfuscate(strVal)
432455
if err != nil {
@@ -474,6 +497,22 @@ func (s *redaction) shouldRedactKey(k string) bool {
474497
return false
475498
}
476499

500+
func (s *redaction) shouldRedactSpanName(span *ptrace.Span) bool {
501+
if s.urlSanitizer == nil {
502+
return false
503+
}
504+
spanKind := span.Kind()
505+
if spanKind != ptrace.SpanKindClient && spanKind != ptrace.SpanKindServer {
506+
return false
507+
}
508+
509+
spanName := span.Name()
510+
if !strings.Contains(spanName, "/") {
511+
return false
512+
}
513+
return !s.shouldAllowValue(spanName)
514+
}
515+
477516
const (
478517
debug = "debug"
479518
info = "info"

0 commit comments

Comments
 (0)