Skip to content

Commit 637d90a

Browse files
committed
feat(tag_cardinality_limit transform): add exact_fingerprint mode for lower memory
Introduces Mode::ExactFingerprint (YAML: mode: exact_fingerprint), an opt-in storage mode that reduces per-accepted-value memory from ~128 B to ~9 B by storing 64-bit hash fingerprints of tag values instead of the full strings. Design choices: - Stores only u64 fingerprints; accepts a vanishingly small collision risk (≈ 7e-15 per set at the default value_limit=500), which can cause a minor cardinality undercount. Mode::Exact remains byte-exact for users who need it. - Fingerprints are computed with the std DefaultHasher (stateless, fixed keys, no per-set hasher state) — the same hasher TagValueSet's own Hash impl uses internally. - Fingerprint table uses HashBuildHasher (identity/pass-through hasher) to avoid double-hashing an already-uniformly-distributed u64. - Mode::ExactFingerprint and OverrideMode::ExactFingerprint are new, user-visible config variants. Existing Mode::Exact semantics are completely unchanged. Also fixes test_accepted_tag_value_set_probabilistic in tag_value_set.rs, which was erroneously constructing Mode::Exact and therefore not testing the Bloom path at all. Benchmarked on a local release binary across M=50K/100K, T=10/50, V=1/10/100. Memory reduction vs exact mode: 36-46% at V=1, 65-75% at V=10, 85-88% at V=100. See tcl_memtest/SESSION_NOTES_2026-06-12.md for full results. Co-authored-by: ArunPiduguDD <arun.pidugu@datadoghq.com>
1 parent 0bcb4c6 commit 637d90a

6 files changed

Lines changed: 291 additions & 4 deletions

File tree

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
The `tag_cardinality_limit` transform now supports `mode: exact_fingerprint`, a new storage
2+
mode that can reduce memory usage for high-cardinality tag values compared to
3+
`mode: exact`. Instead of storing the full tag-value strings, only a 64-bit fingerprint hash of
4+
each value is kept. The trade-off of this approach is that throughput is slightly impacted
5+
due to extra hashing operations, and there is technically a chance of collisions at very high
6+
cardinalities (albeit very unlikely).
7+
8+
authors: ArunPiduguDD

src/transforms/tag_cardinality_limit/config.rs

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,13 @@ pub enum Mode {
114114
/// metrics with new tags after the limit has been hit.
115115
Exact,
116116

117+
/// This mode operates similarly to `exact` mode except it tracks cardinality using 64-bit hash fingerprints
118+
/// of tag values instead of the original strings. This leads to lower memory requirements in most
119+
/// scenarios (assuming average tag value size is greater than 8 bytes) at the cost of slightly
120+
/// reduced throughput due to extra hashing operations and a very small chance of collisions at
121+
/// very high cardinalities.
122+
ExactFingerprint,
123+
117124
/// Tracks cardinality probabilistically.
118125
///
119126
/// This mode has lower memory requirements than `exact`, but may occasionally allow metric
@@ -183,6 +190,9 @@ pub enum OverrideMode {
183190
/// Tracks cardinality exactly. See `Mode::Exact` for details.
184191
Exact,
185192

193+
/// Tracks cardinality using 64-bit hash fingerprints. See `Mode::ExactFingerprint` for details.
194+
ExactFingerprint,
195+
186196
/// Tracks cardinality probabilistically. See `Mode::Probabilistic` for details.
187197
Probabilistic(BloomFilterConfig),
188198

@@ -196,6 +206,7 @@ impl OverrideMode {
196206
pub const fn as_mode(&self) -> Option<Mode> {
197207
match self {
198208
OverrideMode::Exact => Some(Mode::Exact),
209+
OverrideMode::ExactFingerprint => Some(Mode::ExactFingerprint),
199210
OverrideMode::Probabilistic(b) => Some(Mode::Probabilistic(*b)),
200211
OverrideMode::Excluded => None,
201212
}

src/transforms/tag_cardinality_limit/tag_value_set.rs

Lines changed: 109 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,25 @@
1-
use std::{collections::HashSet, fmt};
1+
use std::{
2+
collections::HashSet,
3+
fmt,
4+
hash::{BuildHasher, BuildHasherDefault},
5+
};
26

37
use bloomy::BloomFilter;
8+
use hash_hasher::HashedSet;
9+
use seahash::SeaHasher;
410

511
use crate::{event::metric::TagValueSet, transforms::tag_cardinality_limit::config::Mode};
612

713
/// Container for storing the set of accepted values for a given tag key.
14+
///
15+
/// # Storage backend selection
16+
///
17+
/// | `Mode` | Storage |
18+
/// |----------------------|---------------------------------|
19+
/// | `Exact` | `HashSet<TagValueSet>` |
20+
/// | `ExactFingerprint` | `HashSet<u64>` (fingerprints) |
21+
/// | `Probabilistic` | `BloomFilter` |
22+
823
#[derive(Debug)]
924
pub struct AcceptedTagValueSet {
1025
storage: TagValueSetStorage,
@@ -13,6 +28,8 @@ pub struct AcceptedTagValueSet {
1328
enum TagValueSetStorage {
1429
Set(HashSet<TagValueSet>),
1530
Bloom(BloomFilterStorage),
31+
/// Stores 64-bit hash fingerprints of accepted tag values
32+
Fingerprint(FingerprintStorage),
1633
}
1734

1835
/// A bloom filter that tracks the number of items inserted into it.
@@ -49,19 +66,51 @@ impl BloomFilterStorage {
4966
}
5067
}
5168

69+
struct FingerprintStorage {
70+
fps: HashedSet<u64>,
71+
}
72+
73+
impl FingerprintStorage {
74+
fn new() -> Self {
75+
Self {
76+
fps: HashedSet::default(),
77+
}
78+
}
79+
80+
/// Compute a 64-bit fingerprint of a tag value
81+
fn fingerprint(value: &TagValueSet) -> u64 {
82+
BuildHasherDefault::<SeaHasher>::default().hash_one(value)
83+
}
84+
85+
fn insert(&mut self, value: &TagValueSet) {
86+
self.fps.insert(Self::fingerprint(value));
87+
}
88+
89+
fn contains(&self, value: &TagValueSet) -> bool {
90+
self.fps.contains(&Self::fingerprint(value))
91+
}
92+
93+
fn len(&self) -> usize {
94+
self.fps.len()
95+
}
96+
}
97+
5298
impl fmt::Debug for TagValueSetStorage {
5399
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
54100
match self {
55101
TagValueSetStorage::Set(set) => write!(f, "Set({set:?})"),
56102
TagValueSetStorage::Bloom(_) => write!(f, "Bloom"),
103+
TagValueSetStorage::Fingerprint(_) => write!(f, "Fingerprint"),
57104
}
58105
}
59106
}
60107

61108
impl AcceptedTagValueSet {
109+
/// Create a new `AcceptedTagValueSet` for the given mode.
62110
pub fn new(mode: &Mode) -> Self {
63111
let storage = match &mode {
64112
Mode::Exact => TagValueSetStorage::Set(HashSet::new()),
113+
Mode::ExactFingerprint => TagValueSetStorage::Fingerprint(FingerprintStorage::new()),
65114
Mode::Probabilistic(config) => {
66115
TagValueSetStorage::Bloom(BloomFilterStorage::new(config.cache_size_per_key))
67116
}
@@ -73,13 +122,15 @@ impl AcceptedTagValueSet {
73122
match &self.storage {
74123
TagValueSetStorage::Set(set) => set.contains(value),
75124
TagValueSetStorage::Bloom(bloom) => bloom.contains(value),
125+
TagValueSetStorage::Fingerprint(fp) => fp.contains(value),
76126
}
77127
}
78128

79129
pub fn len(&self) -> usize {
80130
match &self.storage {
81131
TagValueSetStorage::Set(set) => set.len(),
82132
TagValueSetStorage::Bloom(bloom) => bloom.count(),
133+
TagValueSetStorage::Fingerprint(fp) => fp.len(),
83134
}
84135
}
85136

@@ -89,14 +140,18 @@ impl AcceptedTagValueSet {
89140
set.insert(value);
90141
}
91142
TagValueSetStorage::Bloom(bloom) => bloom.insert(&value),
143+
TagValueSetStorage::Fingerprint(fp) => fp.insert(&value),
92144
};
93145
}
94146
}
95147

96148
#[cfg(test)]
97149
mod tests {
98150
use super::*;
99-
use crate::{event::metric::TagValueSet, transforms::tag_cardinality_limit::config::Mode};
151+
use crate::{
152+
event::metric::TagValueSet,
153+
transforms::tag_cardinality_limit::config::{BloomFilterConfig, Mode},
154+
};
100155

101156
#[test]
102157
fn test_accepted_tag_value_set_exact() {
@@ -116,7 +171,11 @@ mod tests {
116171

117172
#[test]
118173
fn test_accepted_tag_value_set_probabilistic() {
119-
let mut accepted_tag_value_set = AcceptedTagValueSet::new(&Mode::Exact);
174+
// Previously this test mistakenly constructed Mode::Exact; fixed to use Probabilistic.
175+
let mut accepted_tag_value_set =
176+
AcceptedTagValueSet::new(&Mode::Probabilistic(BloomFilterConfig {
177+
cache_size_per_key: 5 * 1024,
178+
}));
120179

121180
assert!(!accepted_tag_value_set.contains(&TagValueSet::from(["value1".to_string()])));
122181
assert_eq!(accepted_tag_value_set.len(), 0);
@@ -134,4 +193,51 @@ mod tests {
134193
assert_eq!(accepted_tag_value_set.len(), 2);
135194
assert!(accepted_tag_value_set.contains(&TagValueSet::from(["value2".to_string()])));
136195
}
196+
197+
#[test]
198+
fn test_accepted_tag_value_set_fingerprint() {
199+
let mut set = AcceptedTagValueSet::new(&Mode::ExactFingerprint);
200+
201+
assert!(!set.contains(&TagValueSet::from(["value1".to_string()])));
202+
assert_eq!(set.len(), 0);
203+
204+
set.insert(TagValueSet::from(["value1".to_string()]));
205+
assert_eq!(set.len(), 1);
206+
assert!(set.contains(&TagValueSet::from(["value1".to_string()])));
207+
208+
// Inserting the same value again must not increase the count.
209+
set.insert(TagValueSet::from(["value1".to_string()]));
210+
assert_eq!(set.len(), 1);
211+
212+
set.insert(TagValueSet::from(["value2".to_string()]));
213+
assert_eq!(set.len(), 2);
214+
assert!(set.contains(&TagValueSet::from(["value2".to_string()])));
215+
216+
// An un-inserted value must not appear to be contained.
217+
assert!(!set.contains(&TagValueSet::from(["value3".to_string()])));
218+
219+
// Fingerprinting is deterministic, so a separate set must agree on membership.
220+
let mut set2 = AcceptedTagValueSet::new(&Mode::ExactFingerprint);
221+
set2.insert(TagValueSet::from(["value1".to_string()]));
222+
assert!(set2.contains(&TagValueSet::from(["value1".to_string()])));
223+
assert!(!set2.contains(&TagValueSet::from(["value3".to_string()])));
224+
}
225+
226+
#[test]
227+
fn test_fingerprint_distribution_no_collisions() {
228+
// Empirically guards the "good distribution" claim: inserting many distinct values
229+
// must yield an equal number of distinct fingerprints. At 64 bits the birthday
230+
// collision probability for 100k values is ~2.7e-10, so any collision here would
231+
// indicate a badly-distributed hash rather than bad luck.
232+
let mut set = AcceptedTagValueSet::new(&Mode::ExactFingerprint);
233+
let n = 100_000;
234+
for i in 0..n {
235+
set.insert(TagValueSet::from([format!("tag-value-{i}")]));
236+
}
237+
assert_eq!(
238+
set.len(),
239+
n,
240+
"distinct values must produce distinct fingerprints"
241+
);
242+
}
137243
}

0 commit comments

Comments
 (0)