Skip to content

Commit 58132ba

Browse files
authored
feat: remove not needed encryption of secrets (#1123)
* feat: remove not needed encryption of secrets. Instead, use a UUID generator as we do for PII, and reuse the same session store mechanism. Closes: #929 * fix tests * unify interface in sensitive data * add missing tests * changes from rebase * fixes from review * fixes in tests * fix tests
1 parent a031791 commit 58132ba

21 files changed

+500
-1086
lines changed

src/codegate/cli.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
from codegate.config import Config, ConfigurationError
1717
from codegate.db.connection import init_db_sync, init_session_if_not_exists
1818
from codegate.pipeline.factory import PipelineFactory
19-
from codegate.pipeline.secrets.manager import SecretsManager
19+
from codegate.pipeline.sensitive_data.manager import SensitiveDataManager
2020
from codegate.providers import crud as provendcrud
2121
from codegate.providers.copilot.provider import CopilotProvider
2222
from codegate.server import init_app
@@ -331,8 +331,8 @@ def serve( # noqa: C901
331331
click.echo("Existing Certificates are already present.")
332332

333333
# Initialize secrets manager and pipeline factory
334-
secrets_manager = SecretsManager()
335-
pipeline_factory = PipelineFactory(secrets_manager)
334+
sensitive_data_manager = SensitiveDataManager()
335+
pipeline_factory = PipelineFactory(sensitive_data_manager)
336336

337337
app = init_app(pipeline_factory)
338338

src/codegate/pipeline/base.py

+8-19
Original file line numberDiff line numberDiff line change
@@ -12,34 +12,23 @@
1212
from codegate.clients.clients import ClientType
1313
from codegate.db.models import Alert, AlertSeverity, Output, Prompt
1414
from codegate.extract_snippets.message_extractor import CodeSnippet
15-
from codegate.pipeline.secrets.manager import SecretsManager
15+
from codegate.pipeline.sensitive_data.manager import SensitiveDataManager
1616

1717
logger = structlog.get_logger("codegate")
1818

1919

2020
@dataclass
2121
class PipelineSensitiveData:
22-
manager: SecretsManager
22+
manager: SensitiveDataManager
2323
session_id: str
24-
api_key: Optional[str] = None
2524
model: Optional[str] = None
26-
provider: Optional[str] = None
27-
api_base: Optional[str] = None
2825

2926
def secure_cleanup(self):
3027
"""Securely cleanup sensitive data for this session"""
3128
if self.manager is None or self.session_id == "":
3229
return
33-
3430
self.manager.cleanup_session(self.session_id)
3531
self.session_id = ""
36-
37-
# Securely wipe the API key using the same method as secrets manager
38-
if self.api_key is not None:
39-
api_key_bytes = bytearray(self.api_key.encode())
40-
self.manager.crypto.wipe_bytearray(api_key_bytes)
41-
self.api_key = None
42-
4332
self.model = None
4433

4534

@@ -274,19 +263,19 @@ class InputPipelineInstance:
274263
def __init__(
275264
self,
276265
pipeline_steps: List[PipelineStep],
277-
secret_manager: SecretsManager,
266+
sensitive_data_manager: SensitiveDataManager,
278267
is_fim: bool,
279268
client: ClientType = ClientType.GENERIC,
280269
):
281270
self.pipeline_steps = pipeline_steps
282-
self.secret_manager = secret_manager
271+
self.sensitive_data_manager = sensitive_data_manager
283272
self.is_fim = is_fim
284273
self.context = PipelineContext(client=client)
285274

286275
# we create the sensitive context here so that it is not shared between individual requests
287276
# TODO: could we get away with just generating the session ID for an instance?
288277
self.context.sensitive = PipelineSensitiveData(
289-
manager=self.secret_manager,
278+
manager=self.sensitive_data_manager,
290279
session_id=str(uuid.uuid4()),
291280
)
292281
self.context.metadata["is_fim"] = is_fim
@@ -343,20 +332,20 @@ class SequentialPipelineProcessor:
343332
def __init__(
344333
self,
345334
pipeline_steps: List[PipelineStep],
346-
secret_manager: SecretsManager,
335+
sensitive_data_manager: SensitiveDataManager,
347336
client_type: ClientType,
348337
is_fim: bool,
349338
):
350339
self.pipeline_steps = pipeline_steps
351-
self.secret_manager = secret_manager
340+
self.sensitive_data_manager = sensitive_data_manager
352341
self.is_fim = is_fim
353342
self.instance = self._create_instance(client_type)
354343

355344
def _create_instance(self, client_type: ClientType) -> InputPipelineInstance:
356345
"""Create a new pipeline instance for processing a request"""
357346
return InputPipelineInstance(
358347
self.pipeline_steps,
359-
self.secret_manager,
348+
self.sensitive_data_manager,
360349
self.is_fim,
361350
client_type,
362351
)

src/codegate/pipeline/factory.py

+7-7
Original file line numberDiff line numberDiff line change
@@ -12,18 +12,18 @@
1212
PiiRedactionNotifier,
1313
PiiUnRedactionStep,
1414
)
15-
from codegate.pipeline.secrets.manager import SecretsManager
1615
from codegate.pipeline.secrets.secrets import (
1716
CodegateSecrets,
1817
SecretRedactionNotifier,
1918
SecretUnredactionStep,
2019
)
20+
from codegate.pipeline.sensitive_data.manager import SensitiveDataManager
2121
from codegate.pipeline.system_prompt.codegate import SystemPrompt
2222

2323

2424
class PipelineFactory:
25-
def __init__(self, secrets_manager: SecretsManager):
26-
self.secrets_manager = secrets_manager
25+
def __init__(self, sensitive_data_manager: SensitiveDataManager):
26+
self.sensitive_data_manager = sensitive_data_manager
2727

2828
def create_input_pipeline(self, client_type: ClientType) -> SequentialPipelineProcessor:
2929
input_steps: List[PipelineStep] = [
@@ -32,7 +32,7 @@ def create_input_pipeline(self, client_type: ClientType) -> SequentialPipelinePr
3232
# and without obfuscating the secrets, we'd leak the secrets during those
3333
# later steps
3434
CodegateSecrets(),
35-
CodegatePii(),
35+
CodegatePii(self.sensitive_data_manager),
3636
CodegateCli(),
3737
CodegateContextRetriever(),
3838
SystemPrompt(
@@ -41,19 +41,19 @@ def create_input_pipeline(self, client_type: ClientType) -> SequentialPipelinePr
4141
]
4242
return SequentialPipelineProcessor(
4343
input_steps,
44-
self.secrets_manager,
44+
self.sensitive_data_manager,
4545
client_type,
4646
is_fim=False,
4747
)
4848

4949
def create_fim_pipeline(self, client_type: ClientType) -> SequentialPipelineProcessor:
5050
fim_steps: List[PipelineStep] = [
5151
CodegateSecrets(),
52-
CodegatePii(),
52+
CodegatePii(self.sensitive_data_manager),
5353
]
5454
return SequentialPipelineProcessor(
5555
fim_steps,
56-
self.secrets_manager,
56+
self.sensitive_data_manager,
5757
client_type,
5858
is_fim=True,
5959
)

src/codegate/pipeline/pii/analyzer.py

+18-102
Original file line numberDiff line numberDiff line change
@@ -1,47 +1,16 @@
1-
import uuid
2-
from typing import Any, Dict, List, Optional, Tuple
1+
from typing import Any, List, Optional
32

43
import structlog
54
from presidio_analyzer import AnalyzerEngine
65
from presidio_anonymizer import AnonymizerEngine
76

87
from codegate.db.models import AlertSeverity
98
from codegate.pipeline.base import PipelineContext
9+
from codegate.pipeline.sensitive_data.session_store import SessionStore
1010

1111
logger = structlog.get_logger("codegate.pii.analyzer")
1212

1313

14-
class PiiSessionStore:
15-
"""
16-
A class to manage PII (Personally Identifiable Information) session storage.
17-
18-
Attributes:
19-
session_id (str): The unique identifier for the session. If not provided, a new UUID
20-
is generated. mappings (Dict[str, str]): A dictionary to store mappings between UUID
21-
placeholders and PII.
22-
23-
Methods:
24-
add_mapping(pii: str) -> str:
25-
Adds a PII string to the session store and returns a UUID placeholder for it.
26-
27-
get_pii(uuid_placeholder: str) -> str:
28-
Retrieves the PII string associated with the given UUID placeholder. If the placeholder
29-
is not found, returns the placeholder itself.
30-
"""
31-
32-
def __init__(self, session_id: str = None):
33-
self.session_id = session_id or str(uuid.uuid4())
34-
self.mappings: Dict[str, str] = {}
35-
36-
def add_mapping(self, pii: str) -> str:
37-
uuid_placeholder = f"<{str(uuid.uuid4())}>"
38-
self.mappings[uuid_placeholder] = pii
39-
return uuid_placeholder
40-
41-
def get_pii(self, uuid_placeholder: str) -> str:
42-
return self.mappings.get(uuid_placeholder, uuid_placeholder)
43-
44-
4514
class PiiAnalyzer:
4615
"""
4716
PiiAnalyzer class for analyzing and anonymizing text containing PII.
@@ -52,12 +21,12 @@ class PiiAnalyzer:
5221
Get or create the singleton instance of PiiAnalyzer.
5322
analyze:
5423
text (str): The text to analyze for PII.
55-
Tuple[str, List[Dict[str, Any]], PiiSessionStore]: The anonymized text, a list of
24+
Tuple[str, List[Dict[str, Any]], SessionStore]: The anonymized text, a list of
5625
found PII details, and the session store.
5726
entities (List[str]): The PII entities to analyze for.
5827
restore_pii:
5928
anonymized_text (str): The text with anonymized PII.
60-
session_store (PiiSessionStore): The PiiSessionStore used for anonymization.
29+
session_store (SessionStore): The SessionStore used for anonymization.
6130
str: The text with original PII restored.
6231
"""
6332

@@ -95,13 +64,11 @@ def __init__(self):
9564
# Create analyzer with custom NLP engine
9665
self.analyzer = AnalyzerEngine(nlp_engine=nlp_engine)
9766
self.anonymizer = AnonymizerEngine()
98-
self.session_store = PiiSessionStore()
67+
self.session_store = SessionStore()
9968

10069
PiiAnalyzer._instance = self
10170

102-
def analyze(
103-
self, text: str, context: Optional[PipelineContext] = None
104-
) -> Tuple[str, List[Dict[str, Any]], PiiSessionStore]:
71+
def analyze(self, text: str, context: Optional[PipelineContext] = None) -> List:
10572
# Prioritize credit card detection first
10673
entities = [
10774
"PHONE_NUMBER",
@@ -125,81 +92,30 @@ def analyze(
12592
language="en",
12693
score_threshold=0.3, # Lower threshold to catch more potential matches
12794
)
95+
return analyzer_results
12896

129-
# Track found PII
130-
found_pii = []
131-
132-
# Only anonymize if PII was found
133-
if analyzer_results:
134-
# Log each found PII instance and anonymize
135-
anonymized_text = text
136-
for result in analyzer_results:
137-
pii_value = text[result.start : result.end]
138-
uuid_placeholder = self.session_store.add_mapping(pii_value)
139-
pii_info = {
140-
"type": result.entity_type,
141-
"value": pii_value,
142-
"score": result.score,
143-
"start": result.start,
144-
"end": result.end,
145-
"uuid_placeholder": uuid_placeholder,
146-
}
147-
found_pii.append(pii_info)
148-
anonymized_text = anonymized_text.replace(pii_value, uuid_placeholder)
149-
150-
# Log each PII detection with its UUID mapping
151-
logger.info(
152-
"PII detected and mapped",
153-
pii_type=result.entity_type,
154-
score=f"{result.score:.2f}",
155-
uuid=uuid_placeholder,
156-
# Don't log the actual PII value for security
157-
value_length=len(pii_value),
158-
session_id=self.session_store.session_id,
159-
)
160-
161-
# Log summary of all PII found in this analysis
162-
if found_pii and context:
163-
# Create notification string for alert
164-
notify_string = (
165-
f"**PII Detected** 🔒\n"
166-
f"- Total PII Found: {len(found_pii)}\n"
167-
f"- Types Found: {', '.join(set(p['type'] for p in found_pii))}\n"
168-
)
169-
context.add_alert(
170-
self._name,
171-
trigger_string=notify_string,
172-
severity_category=AlertSeverity.CRITICAL,
173-
)
174-
175-
logger.info(
176-
"PII analysis complete",
177-
total_pii_found=len(found_pii),
178-
pii_types=[p["type"] for p in found_pii],
179-
session_id=self.session_store.session_id,
180-
)
181-
182-
# Return the anonymized text, PII details, and session store
183-
return anonymized_text, found_pii, self.session_store
184-
185-
# If no PII found, return original text, empty list, and session store
186-
return text, [], self.session_store
187-
188-
def restore_pii(self, anonymized_text: str, session_store: PiiSessionStore) -> str:
97+
def restore_pii(self, session_id: str, anonymized_text: str) -> str:
18998
"""
19099
Restore the original PII (Personally Identifiable Information) in the given anonymized text.
191100
192101
This method replaces placeholders in the anonymized text with their corresponding original
193-
PII values using the mappings stored in the provided PiiSessionStore.
102+
PII values using the mappings stored in the provided SessionStore.
194103
195104
Args:
196105
anonymized_text (str): The text containing placeholders for PII.
197-
session_store (PiiSessionStore): The session store containing mappings of placeholders
106+
session_id (str): The session id containing mappings of placeholders
198107
to original PII.
199108
200109
Returns:
201110
str: The text with the original PII restored.
202111
"""
203-
for uuid_placeholder, original_pii in session_store.mappings.items():
112+
session_data = self.session_store.get_by_session_id(session_id)
113+
if not session_data:
114+
logger.warning(
115+
"No active PII session found for given session ID. Unable to restore PII."
116+
)
117+
return anonymized_text
118+
119+
for uuid_placeholder, original_pii in session_data.items():
204120
anonymized_text = anonymized_text.replace(uuid_placeholder, original_pii)
205121
return anonymized_text

0 commit comments

Comments
 (0)