Skip to content

Commit 9e00767

Browse files
fix: ensure consistent placeholder assignment for identical values (#283)
* fix: ensure consistent placeholder assignment for identical values in Anonymize scanner

  - Check vault for existing value matches before creating new placeholders
  - Reuse existing placeholder indices for known values
  - Calculate next available index for new values to prevent conflicts
  - Add batch consistency tracking within single processing runs
  - Prevents duplicate vault entries for identical entity values
  - Maintains referential integrity for proper deanonymization

  Fixes an issue where the same entity values (e.g., "John Smith") would get different placeholders ([REDACTED_PERSON_1], [REDACTED_PERSON_2]) across multiple scans, making consistent anonymization impossible.

  🤖 Generated with [Claude Code](https://claude.ai/code)

  Co-Authored-By: Claude <[email protected]>

* style: apply code formatting and fix performance warning in anonymize.py

  - Apply ruff code formatting to improve readability
  - Optimize try-except logic to avoid performance overhead in loop
  - Pre-filter placeholder parts before attempting index extraction

  🤖 Generated with [Claude Code](https://claude.ai/code)

  Co-Authored-By: Claude <[email protected]>

* test: add comprehensive test for placeholder consistency in Anonymize scanner

  - Test ensures identical values get same placeholders across multiple scans
  - Verifies no duplicate entries in vault for same entity values
  - Confirms proper index assignment and reuse logic
  - Validates referential integrity for deanonymization

  🤖 Generated with [Claude Code](https://claude.ai/code)

  Co-Authored-By: Claude <[email protected]>

* style: apply code formatting to test assertions

  Reformat multi-line assert statements to follow consistent style guidelines.

  🤖 Generated with [Claude Code](https://claude.ai/code)

  Co-Authored-By: Claude <[email protected]>

* Trigger actions: empty commit

* Empty commit to retrigger git actions

* fix space

* Disable caching and add cleanup

---------

Co-authored-by: Claude <[email protected]>
1 parent 53af270 commit 9e00767

File tree

3 files changed

+140
-14
lines changed

3 files changed

+140
-14
lines changed

.github/workflows/test.yml

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,15 +32,37 @@ jobs:
3232

3333
steps:
3434
- uses: actions/checkout@v4
35+
36+
- name: Aggressive disk cleanup
37+
run: |
38+
sudo rm -rf /usr/local/lib/android
39+
docker system prune -af
40+
sudo apt-get clean
41+
sudo rm -rf /var/lib/apt/lists/*
42+
sudo rm -rf /tmp/*
43+
sudo rm -rf /var/tmp/*
44+
rm -rf ~/.cache/huggingface
45+
rm -rf ~/.cache/pip
46+
df -h
47+
3548
- name: Set up Python ${{ matrix.python-version }}
3649
uses: actions/setup-python@v5
3750
with:
3851
python-version: ${{ matrix.python-version }}
39-
cache: 'pip'
52+
cache: '' # disables pip cache
53+
4054
- name: Install deps
4155
run: |
4256
python -m pip install --upgrade pip
4357
python -m pip install ".[dev]"
58+
59+
- name: Free up space & Hugging Face cache
60+
run: |
61+
rm -rf ~/.cache/huggingface
62+
df -h
63+
4464
- name: Test with pytest
65+
env:
66+
HF_HUB_DISABLE_CACHE: 1 # prevents cache from filling up runner disk
4567
run: |
4668
python -m pytest --exitfirst --verbose --failed-first --cov=. --color=yes --code-highlight=yes

llm_guard/input_scanners/anonymize.py

Lines changed: 51 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -255,7 +255,9 @@ def _anonymize(
255255
"""
256256
text_replace_builder = TextReplaceBuilder(original_text=prompt)
257257

258-
entity_type_counter, new_entity_counter = {}, {}
258+
entity_type_counter = {}
259+
batch_entity_tracker = {}
260+
259261
for pii_entity in pii_entities:
260262
entity_type = pii_entity.entity_type
261263
entity_value = text_replace_builder.get_text_in_position(
@@ -266,25 +268,60 @@ def _anonymize(
266268
entity_type_counter[entity_type] = {}
267269

268270
if entity_value not in entity_type_counter[entity_type]:
271+
# Check vault for existing exact match first
269272
vault_entities = [
270273
(entity_placeholder, entity_vault_value)
271274
for entity_placeholder, entity_vault_value in vault.get()
272-
if entity_type in entity_placeholder
273-
]
274-
entity_placeholder = [
275-
entity_placeholder
276-
for entity_placeholder, entity_vault_value in vault_entities
277-
if entity_vault_value == entity_value
275+
if entity_placeholder.startswith(f"[REDACTED_{entity_type}_")
278276
]
279-
if len(entity_placeholder) > 0:
277+
278+
# Look for exact value match in vault
279+
existing_placeholder = None
280+
for entity_placeholder, entity_vault_value in vault_entities:
281+
if entity_vault_value == entity_value:
282+
existing_placeholder = entity_placeholder
283+
break
284+
285+
if existing_placeholder:
286+
# Extract index from existing placeholder
280287
entity_type_counter[entity_type][entity_value] = int(
281-
entity_placeholder[0].split("_")[-1][:-1]
288+
existing_placeholder.split("_")[-1][:-1]
282289
)
283290
else:
284-
entity_type_counter[entity_type][entity_value] = (
285-
len(vault_entities) + new_entity_counter.get(entity_type, 0) + 1
286-
)
287-
new_entity_counter[entity_type] = new_entity_counter.get(entity_type, 0) + 1
291+
# Check if we've already assigned this value in current batch
292+
if entity_type not in batch_entity_tracker:
293+
batch_entity_tracker[entity_type] = {}
294+
295+
if entity_value in batch_entity_tracker[entity_type]:
296+
# Reuse the index assigned earlier in this batch
297+
entity_type_counter[entity_type][entity_value] = batch_entity_tracker[
298+
entity_type
299+
][entity_value]
300+
else:
301+
# Calculate next available index
302+
existing_indices = set()
303+
304+
# Add indices from vault
305+
for entity_placeholder, _ in vault_entities:
306+
parts = entity_placeholder.split("_")
307+
if len(parts) >= 3 and parts[-1].endswith("]"):
308+
try:
309+
index = int(parts[-1][:-1])
310+
existing_indices.add(index)
311+
except ValueError:
312+
pass # Skip invalid indices
313+
314+
# Add indices from current batch
315+
for assigned_index in batch_entity_tracker.get(entity_type, {}).values():
316+
existing_indices.add(assigned_index)
317+
318+
# Find next available index
319+
next_index = 1
320+
while next_index in existing_indices:
321+
next_index += 1
322+
323+
entity_type_counter[entity_type][entity_value] = next_index
324+
batch_entity_tracker[entity_type][entity_value] = next_index
288325

289326
results = []
290327
sorted_pii_entities = sorted(pii_entities, reverse=True)
@@ -347,6 +384,7 @@ def scan(self, prompt: str) -> tuple[str, bool, float]:
347384
for entity_placeholder, entity_value in anonymized_results:
348385
if not self._vault.placeholder_exists(entity_placeholder):
349386
self._vault.append((entity_placeholder, entity_value))
387+
350388
return (
351389
self._preamble + sanitized_prompt,
352390
False,

tests/input_scanners/test_anonymize.py

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -327,3 +327,69 @@ def test_patterns():
327327
assert re.search(expression, example) is not None, (
328328
f"Test for {name} failed. No match found for example: {example}"
329329
)
330+
331+
332+
def test_placeholder_consistency():
333+
"""Test that identical values get the same placeholder across multiple scans."""
334+
vault = Vault()
335+
scanner = Anonymize(vault, entity_types=["PERSON", "EMAIL_ADDRESS"])
336+
337+
# First scan with John Smith
338+
prompt1 = "Hello John Smith, your email is [email protected]"
339+
result1, valid1, score1 = scanner.scan(prompt1)
340+
341+
# Second scan with same person
342+
prompt2 = "John Smith called yesterday"
343+
result2, valid2, score2 = scanner.scan(prompt2)
344+
345+
# Third scan with different person
346+
prompt3 = "Mary Johnson replied to [email protected]"
347+
result3, valid3, score3 = scanner.scan(prompt3)
348+
349+
# Fourth scan with both people
350+
prompt4 = "John Smith and Mary Johnson are meeting"
351+
result4, valid4, score4 = scanner.scan(prompt4)
352+
353+
# Verify vault contents
354+
vault_contents = vault.get()
355+
356+
# Assert consistent placeholder assignment
357+
john_placeholders = [p for p, v in vault_contents if v == "John Smith"]
358+
mary_placeholders = [p for p, v in vault_contents if v == "Mary Johnson"]
359+
email_placeholders = [p for p, v in vault_contents if v == "[email protected]"]
360+
361+
# Each unique value should have exactly one placeholder
362+
assert len(john_placeholders) == 1, (
363+
f"John Smith should have 1 placeholder, got {len(john_placeholders)}: {john_placeholders}"
364+
)
365+
assert len(mary_placeholders) == 1, (
366+
f"Mary Johnson should have 1 placeholder, got {len(mary_placeholders)}: {mary_placeholders}"
367+
)
368+
assert len(email_placeholders) == 1, (
369+
f"[email protected] should have 1 placeholder, got {len(email_placeholders)}: {email_placeholders}"
370+
)
371+
372+
# Verify specific placeholder assignments
373+
john_placeholder = john_placeholders[0]
374+
mary_placeholder = mary_placeholders[0]
375+
email_placeholder = email_placeholders[0]
376+
377+
# John should be PERSON_1, Mary should be PERSON_2, email should be EMAIL_ADDRESS_1
378+
assert john_placeholder == "[REDACTED_PERSON_1]", (
379+
f"Expected [REDACTED_PERSON_1], got {john_placeholder}"
380+
)
381+
assert mary_placeholder == "[REDACTED_PERSON_2]", (
382+
f"Expected [REDACTED_PERSON_2], got {mary_placeholder}"
383+
)
384+
assert email_placeholder == "[REDACTED_EMAIL_ADDRESS_1]", (
385+
f"Expected [REDACTED_EMAIL_ADDRESS_1], got {email_placeholder}"
386+
)
387+
388+
# Verify results contain consistent placeholders
389+
assert john_placeholder in result1
390+
assert john_placeholder in result2
391+
assert john_placeholder in result4
392+
assert mary_placeholder in result3
393+
assert mary_placeholder in result4
394+
assert email_placeholder in result1
395+
assert email_placeholder in result3

0 commit comments

Comments
 (0)