Skip to content

Commit bc941de

Browse files
committed
Add URL sanitization feature to redactionprocessor
Signed-off-by: Israel Blancas <[email protected]>
1 parent 1f8e3c4 commit bc941de

File tree

9 files changed

+330
-0
lines changed

9 files changed

+330
-0
lines changed

.chloggen/41535.yaml

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
# Use this changelog template to create an entry for release notes.
2+
3+
# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix'
4+
change_type: enhancement
5+
6+
# The name of the component, or a single word describing the area of concern, (e.g. filelogreceiver)
7+
component: redactionprocessor
8+
9+
# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`).
10+
note: "Add support for URL sanitization in the redaction processor."
11+
12+
# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists.
13+
issues: [41535]
14+
15+
# (Optional) One or more lines of additional information to render under the primary note.
16+
# These lines will be padded with 2 spaces and then inserted directly into the document.
17+
# Use pipe (|) for multiline entries.
18+
subtext:
19+
20+
# If your change doesn't affect end users or the exported elements of any package,
21+
# you should instead start your pull request title with [chore] or use the "Skip Changelog" label.
22+
# Optional: The change log or logs in which this entry should be included.
23+
# e.g. '[user]' or '[user, api]'
24+
# Include 'user' if the change is relevant to end users.
25+
# Include 'api' if there is a change to a library API.
26+
# Default: '[user]'
27+
change_logs: []

processor/redactionprocessor/README.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,12 @@ processors:
102102
# - `info` includes just the redacted key counts in the summary
103103
# - `silent` omits the summary attributes
104104
summary: debug
105+
# url_sanitization configures URL sanitization, which removes variable elements from URLs that would otherwise cause high cardinality issues
106+
url_sanitization:
107+
# enabled controls whether URL sanitization is active
108+
enabled: true
109+
# attributes is a list of attribute keys that contain URLs to be sanitized
110+
attributes: ["http.url", "url"]
105111
```
106112
107113
Refer to [config.yaml](./testdata/config.yaml) for how to fit the configuration
@@ -130,6 +136,8 @@ instead of masking them with a fixed string. By default, no hash function is use
130136
and masking with a fixed string is performed. The supported hash functions
131137
are `md5`, `sha1` and `sha3` (SHA-256).
132138

139+
The `url_sanitization` configuration enables sanitization of URLs in the specified attributes by removing potentially sensitive, variable information such as UUIDs, timestamps, and other non-essential path segments. It is controlled by two settings: `enabled`, which turns the feature on, and `attributes`, which lists the attribute keys whose values are treated as URLs. When the feature is enabled, span names and log bodies are also passed through the sanitizer. This is particularly useful for reducing cardinality in telemetry data while preserving the essential parts of URLs for troubleshooting.
140+
133141
For example, if `notes` is on the list of allowed keys, then the `notes`
134142
attribute is retained. However, if there is a value such as a credit card
135143
number in the `notes` field that matched a regular expression on the list of

processor/redactionprocessor/config.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import (
1010
"strings"
1111

1212
"github.com/open-telemetry/opentelemetry-collector-contrib/processor/redactionprocessor/internal/db"
13+
"github.com/open-telemetry/opentelemetry-collector-contrib/processor/redactionprocessor/internal/url"
1314
)
1415

1516
var _ encoding.TextUnmarshaler = (*HashFunction)(nil)
@@ -69,6 +70,9 @@ type Config struct {
6970
// information, while it is valuable when integrating and testing a new
7071
// configuration. Possible values are `debug`, `info`, and `silent`.
7172
Summary string `mapstructure:"summary"`
73+
74+
// URLSanitization configures URL sanitization, which removes UUIDs, timestamps, and other non-essential information from URLs
75+
URLSanitization url.URLSanitizationConfig `mapstructure:"url_sanitization"`
7276
}
7377

7478
func (u HashFunction) String() string {

processor/redactionprocessor/go.mod

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ go 1.24
44

55
require (
66
github.com/DataDog/datadog-agent/pkg/obfuscate v0.69.4
7+
github.com/grafana/clusterurl v0.0.0-20250902185539-d4367e922036
78
github.com/stretchr/testify v1.10.0
89
go.opentelemetry.io/collector/component v1.40.0
910
go.opentelemetry.io/collector/component/componenttest v0.134.0
@@ -38,6 +39,7 @@ require (
3839
github.com/gogo/protobuf v1.3.2 // indirect
3940
github.com/google/uuid v1.6.0 // indirect
4041
github.com/hashicorp/go-version v1.7.0 // indirect
42+
github.com/hashicorp/golang-lru/v2 v2.0.7 // indirect
4143
github.com/json-iterator/go v1.1.12 // indirect
4244
github.com/knadh/koanf/maps v0.1.2 // indirect
4345
github.com/knadh/koanf/providers/confmap v1.0.0 // indirect
@@ -78,6 +80,8 @@ require (
7880
gopkg.in/yaml.v3 v3.0.1 // indirect
7981
)
8082

83+
replace github.com/grafana/clusterurl => github.com/grafana/clusterurl v0.0.0-20250902185539-d4367e922036
84+
8185
retract (
8286
v0.76.2
8387
v0.76.1

processor/redactionprocessor/go.sum

Lines changed: 4 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
// Copyright The OpenTelemetry Authors
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
package url // import "github.com/open-telemetry/opentelemetry-collector-contrib/processor/redactionprocessor/internal/url"
5+
6+
// URLSanitizationConfig holds the settings for URL sanitization. The
// redaction processor's Config embeds it under the `url_sanitization` key.
type URLSanitizationConfig struct {
7+
// Enabled controls whether URL sanitization is active.
Enabled bool `mapstructure:"enabled"`
8+
// Attributes is the list of attributes that will be sanitized.
9+
Attributes []string `mapstructure:"attributes"`
10+
}
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
// Copyright The OpenTelemetry Authors
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
package url // import "github.com/open-telemetry/opentelemetry-collector-contrib/processor/redactionprocessor/internal/url"
5+
6+
import (
7+
"fmt"
8+
9+
"github.com/grafana/clusterurl/pkg/clusterurl"
10+
)
11+
12+
// URLSanitizer sanitizes URL strings, either unconditionally (SanitizeURL) or
// only for a configured set of attribute keys (SanitizeAttributeURL).
type URLSanitizer struct {
13+
// classifier is the cluster URL classifier that SanitizeURL delegates to.
classifier *clusterurl.ClusterURLClassifier
14+
// attributes is the set of attribute keys whose values are sanitized;
// only key presence is checked, the bool value is always true.
attributes map[string]bool
15+
}
16+
17+
// NewURLSanitizer builds a URLSanitizer from config.
//
// It creates a cluster URL classifier and indexes config.Attributes into a
// set for constant-time key lookups. config.Enabled is not consulted here;
// the caller decides whether to construct a sanitizer at all.
//
// It returns a wrapped error if the classifier cannot be created.
func NewURLSanitizer(config URLSanitizationConfig) (*URLSanitizer, error) {
18+
// NOTE(review): nil presumably selects the library's default classifier
// configuration — confirm against the clusterurl documentation.
classifier, err := clusterurl.NewClusterURLClassifier(nil)
19+
if err != nil {
20+
return nil, fmt.Errorf("unable to create cluster URL classifier: %w", err)
21+
}
22+
23+
// Turn the attribute list into a set keyed by attribute name.
attributes := make(map[string]bool)
24+
for _, attribute := range config.Attributes {
25+
attributes[attribute] = true
26+
}
27+
28+
return &URLSanitizer{
29+
classifier: classifier,
30+
attributes: attributes,
31+
}, nil
32+
}
33+
34+
// SanitizeAttributeURL sanitizes url only when attributeKey is one of the
// attribute keys the sanitizer was configured with.
//
// The input is returned unchanged when url is empty or when attributeKey is
// not in the configured set; otherwise the result of SanitizeURL is returned.
func (s *URLSanitizer) SanitizeAttributeURL(url, attributeKey string) string {
35+
// Nothing to sanitize in an empty value.
if url == "" {
36+
return url
37+
}
38+
39+
// Only values of explicitly configured attributes are treated as URLs.
if _, ok := s.attributes[attributeKey]; ok {
40+
return s.SanitizeURL(url)
41+
}
42+
43+
return url
44+
}
45+
46+
// SanitizeURL sanitizes the given URL by removing any gibberish words.
// It delegates directly to the cluster URL classifier and, unlike
// SanitizeAttributeURL, applies no attribute-key filtering or empty check.
47+
// https://github.com/open-telemetry/opentelemetry-ebpf-instrumentation/blob/38ca7938595409b8ffe6b897c14a0e3280dd2941/pkg/components/transform/route/cluster.go#L48
48+
func (s *URLSanitizer) SanitizeURL(url string) string {
49+
return s.classifier.ClusterURL(url)
50+
}

processor/redactionprocessor/processor.go

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import (
2323
"golang.org/x/crypto/sha3"
2424

2525
"github.com/open-telemetry/opentelemetry-collector-contrib/processor/redactionprocessor/internal/db"
26+
"github.com/open-telemetry/opentelemetry-collector-contrib/processor/redactionprocessor/internal/url"
2627
)
2728

2829
const attrValuesSeparator = ","
@@ -44,6 +45,8 @@ type redaction struct {
4445
config *Config
4546
// Logger
4647
logger *zap.Logger
48+
// URL sanitizer
49+
urlSanitizer *url.URLSanitizer
4750
// Database obfuscator
4851
dbObfuscator *db.Obfuscator
4952
}
@@ -69,6 +72,13 @@ func newRedaction(ctx context.Context, config *Config, logger *zap.Logger) (*red
6972
return nil, fmt.Errorf("failed to process allow list: %w", err)
7073
}
7174

75+
var urlSanitizer *url.URLSanitizer
76+
if config.URLSanitization.Enabled {
77+
urlSanitizer, err = url.NewURLSanitizer(config.URLSanitization)
78+
if err != nil {
79+
return nil, fmt.Errorf("failed to create URL sanitizer: %w", err)
80+
}
81+
}
7282
dbObfuscator := db.NewObfuscator(config.DBSanitizer)
7383

7484
return &redaction{
@@ -80,6 +90,7 @@ func newRedaction(ctx context.Context, config *Config, logger *zap.Logger) (*red
8090
hashFunction: config.HashFunction,
8191
config: config,
8292
logger: logger,
93+
urlSanitizer: urlSanitizer,
8394
dbObfuscator: dbObfuscator,
8495
}, nil
8596
}
@@ -129,6 +140,10 @@ func (s *redaction) processResourceSpan(ctx context.Context, rs ptrace.ResourceS
129140

130141
// Attributes can also be part of span events
131142
s.processSpanEvents(ctx, span.Events())
143+
144+
if s.urlSanitizer != nil {
145+
span.SetName(s.urlSanitizer.SanitizeURL(span.Name()))
146+
}
132147
}
133148
}
134149
}
@@ -401,6 +416,10 @@ func (s *redaction) processStringValueForAttribute(strVal, attributeKey string)
401416
}
402417
}
403418

419+
if s.urlSanitizer != nil {
420+
strVal = s.urlSanitizer.SanitizeAttributeURL(strVal, attributeKey)
421+
}
422+
404423
if s.dbObfuscator != nil {
405424
obfuscatedQuery, err := s.dbObfuscator.ObfuscateAttribute(strVal, attributeKey)
406425
if err != nil {
@@ -421,6 +440,10 @@ func (s *redaction) processStringValueForLogBody(strVal string) string {
421440
}
422441
}
423442

443+
if s.urlSanitizer != nil {
444+
strVal = s.urlSanitizer.SanitizeURL(strVal)
445+
}
446+
424447
if s.dbObfuscator != nil {
425448
obfuscatedQuery, err := s.dbObfuscator.Obfuscate(strVal)
426449
if err != nil {

0 commit comments

Comments
 (0)