Skip to content

Commit b83f9cd

Browse files
committed
Add URL sanitization feature to redactionprocessor
Signed-off-by: Israel Blancas <[email protected]>
1 parent 791d30d commit b83f9cd

File tree

9 files changed

+411
-0
lines changed

9 files changed

+411
-0
lines changed

.chloggen/41535.yaml

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
# Use this changelog template to create an entry for release notes.
2+
3+
# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix'
4+
change_type: enhancement
5+
6+
# The name of the component, or a single word describing the area of concern, (e.g. filelogreceiver)
7+
component: redactionprocessor
8+
9+
# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`).
10+
note: "Add support for URL sanitization in the redaction processor."
11+
12+
# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists.
13+
issues: [41535]
14+
15+
# (Optional) One or more lines of additional information to render under the primary note.
16+
# These lines will be padded with 2 spaces and then inserted directly into the document.
17+
# Use pipe (|) for multiline entries.
18+
subtext:
19+
20+
# If your change doesn't affect end users or the exported elements of any package,
21+
# you should instead start your pull request title with [chore] or use the "Skip Changelog" label.
22+
# Optional: The change log or logs in which this entry should be included.
23+
# e.g. '[user]' or '[user, api]'
24+
# Include 'user' if the change is relevant to end users.
25+
# Include 'api' if there is a change to a library API.
26+
# Default: '[user]'
27+
change_logs: []

processor/redactionprocessor/README.md

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,20 @@ processors:
102102
# - `info` includes just the redacted key counts in the summary
103103
# - `silent` omits the summary attributes
104104
summary: debug
105+
# url_sanitization configures URL sanitization, which removes variable elements from URLs that would otherwise cause high cardinality issues
106+
url_sanitization:
107+
# enabled controls whether URL sanitization is active
108+
enabled: true
109+
# attributes is a list of attribute keys that contain URLs to be sanitized
110+
attributes: ["http.url", "url"]
111+
# max_segments is the maximum number of path segments to retain in the URL
112+
max_segments: 4
113+
# cache_size is the size of the cache used by the URL classifier
114+
cache_size: 1000
115+
# replace_with is the single character used to replace sanitized segments in the URL
116+
replace_with: "*"
117+
# sanitize_span_name controls whether to apply URL sanitization to span names
118+
sanitize_span_name: true
105119
```
106120
107121
Refer to [config.yaml](./testdata/config.yaml) for how to fit the configuration
@@ -130,6 +144,8 @@ instead of masking them with a fixed string. By default, no hash function is use
130144
and masking with a fixed string is performed. The supported hash functions
131145
are `md5`, `sha1` and `sha3` (SHA-256).
132146

147+
The `url_sanitization` configuration enables sanitization of URLs in specified attributes by removing potentially sensitive information like UUIDs, timestamps, and other non-essential path segments. When enabled, it processes URLs to retain only a configured maximum number of path segments and replaces the rest with the single character configured in `replace_with`. This is particularly useful for reducing cardinality in telemetry data while preserving the essential parts of URLs for troubleshooting.
148+
133149
For example, if `notes` is on the list of allowed keys, then the `notes`
134150
attribute is retained. However, if there is a value such as a credit card
135151
number in the `notes` field that matched a regular expression on the list of

processor/redactionprocessor/config.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import (
1010
"strings"
1111

1212
"github.com/open-telemetry/opentelemetry-collector-contrib/processor/redactionprocessor/internal/db"
13+
"github.com/open-telemetry/opentelemetry-collector-contrib/processor/redactionprocessor/internal/url"
1314
)
1415

1516
var _ encoding.TextUnmarshaler = (*HashFunction)(nil)
@@ -69,6 +70,9 @@ type Config struct {
6970
// information, while it is valuable when integrating and testing a new
7071
// configuration. Possible values are `debug`, `info`, and `silent`.
7172
Summary string `mapstructure:"summary"`
73+
74+
// URLSanitization configures URL sanitization, which removes UUIDs, timestamps, and other non-essential information from URLs
75+
URLSanitization url.URLSanitizationConfig `mapstructure:"url_sanitization"`
7276
}
7377

7478
func (u HashFunction) String() string {

processor/redactionprocessor/go.mod

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ go 1.24
44

55
require (
66
github.com/DataDog/datadog-agent/pkg/obfuscate v0.68.3
7+
github.com/grafana/clusterurl v0.0.0-20250807194035-5e7bb040a64c
78
github.com/stretchr/testify v1.10.0
89
go.opentelemetry.io/collector/component v1.38.0
910
go.opentelemetry.io/collector/component/componenttest v0.132.0
@@ -21,6 +22,7 @@ require (
2122
)
2223

2324
require (
25+
github.com/AlessandroPomponio/go-gibberish v0.0.0-20191004143433-a2d4156f0396 // indirect
2426
github.com/DataDog/datadog-agent/pkg/util/log v0.68.3 // indirect
2527
github.com/DataDog/datadog-agent/pkg/util/scrubber v0.68.3 // indirect
2628
github.com/DataDog/datadog-agent/pkg/version v0.68.3 // indirect
@@ -38,6 +40,7 @@ require (
3840
github.com/gogo/protobuf v1.3.2 // indirect
3941
github.com/google/uuid v1.6.0 // indirect
4042
github.com/hashicorp/go-version v1.7.0 // indirect
43+
github.com/hashicorp/golang-lru/v2 v2.0.7 // indirect
4144
github.com/json-iterator/go v1.1.12 // indirect
4245
github.com/knadh/koanf/maps v0.1.2 // indirect
4346
github.com/knadh/koanf/providers/confmap v1.0.0 // indirect

processor/redactionprocessor/go.sum

Lines changed: 6 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
// Copyright The OpenTelemetry Authors
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
package url // import "github.com/open-telemetry/opentelemetry-collector-contrib/processor/redactionprocessor/internal/url"
5+
6+
import "errors"
7+
8+
// URLSanitizationConfig holds the settings for URL sanitization.
type URLSanitizationConfig struct {
	// Enabled reports whether URL sanitization is active.
	Enabled bool `mapstructure:"enabled"`

	// MaxSegments caps the number of segments kept in a path.
	MaxSegments int `mapstructure:"max_segments"`

	// ReplaceWith is the replacement character for path segments.
	// Validate rejects values longer than one byte.
	ReplaceWith string `mapstructure:"replace_with"`

	// CacheSize sets the size of the classifier's cache.
	CacheSize int `mapstructure:"cache_size"`

	// Attributes lists the attributes whose values are sanitized.
	Attributes []string `mapstructure:"attributes"`

	// SanitizeSpanName controls whether the span name is sanitized as well.
	SanitizeSpanName bool `mapstructure:"sanitize_span_name"`
}
21+
22+
func (u *URLSanitizationConfig) Validate() error {
23+
if len(u.ReplaceWith) > 1 {
24+
return errors.New("replace_with must be a single character")
25+
}
26+
27+
return nil
28+
}
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
// Copyright The OpenTelemetry Authors
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
package url // import "github.com/open-telemetry/opentelemetry-collector-contrib/processor/redactionprocessor/internal/url"
5+
6+
import (
7+
"fmt"
8+
9+
"github.com/grafana/clusterurl/clusterurl"
10+
)
11+
12+
type URLSanitizer struct {
13+
classifier *clusterurl.ClusterURLClassifier
14+
attributes map[string]bool
15+
}
16+
17+
func NewURLSanitizer(config URLSanitizationConfig) (*URLSanitizer, error) {
18+
classifierConfig := clusterurl.DefaultConfig()
19+
if config.CacheSize > 0 {
20+
classifierConfig.CacheSize = config.CacheSize
21+
}
22+
if config.MaxSegments > 0 {
23+
classifierConfig.MaxSegments = config.MaxSegments
24+
}
25+
if config.ReplaceWith != "" {
26+
classifierConfig.ReplaceWith = config.ReplaceWith[0]
27+
}
28+
29+
classifier, err := clusterurl.NewClusterURLClassifier(classifierConfig)
30+
if err != nil {
31+
return nil, fmt.Errorf("unable to create cluster URL classifier: %w", err)
32+
}
33+
34+
attributes := make(map[string]bool)
35+
for _, attribute := range config.Attributes {
36+
attributes[attribute] = true
37+
}
38+
39+
return &URLSanitizer{
40+
classifier: classifier,
41+
attributes: attributes,
42+
}, nil
43+
}
44+
45+
func (s *URLSanitizer) SanitizeAttributeURL(url, attributeKey string) string {
46+
if url == "" {
47+
return url
48+
}
49+
50+
if _, ok := s.attributes[attributeKey]; ok {
51+
return s.SanitizeURL(url)
52+
}
53+
54+
return url
55+
}
56+
57+
// SanitizeURL sanitizes the given URL by removing any gibberish words.
58+
// https://github.com/open-telemetry/opentelemetry-ebpf-instrumentation/blob/38ca7938595409b8ffe6b897c14a0e3280dd2941/pkg/components/transform/route/cluster.go#L48
59+
func (s *URLSanitizer) SanitizeURL(url string) string {
60+
return s.classifier.ClusterURL(url)
61+
}

processor/redactionprocessor/processor.go

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import (
2323
"golang.org/x/crypto/sha3"
2424

2525
"github.com/open-telemetry/opentelemetry-collector-contrib/processor/redactionprocessor/internal/db"
26+
"github.com/open-telemetry/opentelemetry-collector-contrib/processor/redactionprocessor/internal/url"
2627
)
2728

2829
const attrValuesSeparator = ","
@@ -44,6 +45,8 @@ type redaction struct {
4445
config *Config
4546
// Logger
4647
logger *zap.Logger
48+
// URL sanitizer
49+
urlSanitizer *url.URLSanitizer
4750
// Database obfuscator
4851
dbObfuscator *db.Obfuscator
4952
}
@@ -69,6 +72,13 @@ func newRedaction(ctx context.Context, config *Config, logger *zap.Logger) (*red
6972
return nil, fmt.Errorf("failed to process allow list: %w", err)
7073
}
7174

75+
var urlSanitizer *url.URLSanitizer
76+
if config.URLSanitization.Enabled {
77+
urlSanitizer, err = url.NewURLSanitizer(config.URLSanitization)
78+
if err != nil {
79+
return nil, fmt.Errorf("failed to create URL sanitizer: %w", err)
80+
}
81+
}
7282
dbObfuscator := db.NewObfuscator(config.DBSanitizer)
7383

7484
return &redaction{
@@ -80,6 +90,7 @@ func newRedaction(ctx context.Context, config *Config, logger *zap.Logger) (*red
8090
hashFunction: config.HashFunction,
8191
config: config,
8292
logger: logger,
93+
urlSanitizer: urlSanitizer,
8394
dbObfuscator: dbObfuscator,
8495
}, nil
8596
}
@@ -129,6 +140,10 @@ func (s *redaction) processResourceSpan(ctx context.Context, rs ptrace.ResourceS
129140

130141
// Attributes can also be part of span events
131142
s.processSpanEvents(ctx, span.Events())
143+
144+
if s.config.URLSanitization.SanitizeSpanName && s.urlSanitizer != nil {
145+
span.SetName(s.urlSanitizer.SanitizeURL(span.Name()))
146+
}
132147
}
133148
}
134149
}
@@ -401,6 +416,10 @@ func (s *redaction) processStringValueForAttribute(strVal, attributeKey string)
401416
}
402417
}
403418

419+
if s.urlSanitizer != nil {
420+
strVal = s.urlSanitizer.SanitizeAttributeURL(strVal, attributeKey)
421+
}
422+
404423
if s.dbObfuscator != nil {
405424
obfuscatedQuery, err := s.dbObfuscator.ObfuscateAttribute(strVal, attributeKey)
406425
if err != nil {
@@ -421,6 +440,10 @@ func (s *redaction) processStringValueForLogBody(strVal string) string {
421440
}
422441
}
423442

443+
if s.urlSanitizer != nil {
444+
strVal = s.urlSanitizer.SanitizeURL(strVal)
445+
}
446+
424447
if s.dbObfuscator != nil {
425448
obfuscatedQuery, err := s.dbObfuscator.Obfuscate(strVal)
426449
if err != nil {

0 commit comments

Comments
 (0)