Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions src/lib/utils/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -182,3 +182,8 @@ osmo_py_library(
":osmo_errors",
],
)

# Library target that packages redact.py on its own; it declares no deps.
osmo_py_library(
name = "redact",
srcs = ["redact.py"],
)
110 changes: 110 additions & 0 deletions src/lib/utils/redact.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
"""
SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

SPDX-License-Identifier: Apache-2.0
"""
import base64
import collections
import copy
import math
import re
from typing import Dict, Generator, Iterable


# Regex to match secrets in the spec. While this is not a perfect solution, it solves the majority
# of cases. Regex from: https://lookingatcomputer.substack.com/p/regex-is-almost-all-you-need
# Proper secret management:
# https://nvidia.github.io/OSMO/main/user_guide/getting_started/credentials.html
# Capture group 1 is the secret value itself; redact_secrets replaces that text with [MASKED]
# and keeps the rest of the match (key name, separator, quotes).
SECRET_REDACTION_RE = re.compile(
r'''(?i)[\w.-]{0,50}?(?:access|auth|(?-i:[Aa]pi|API)|credential|creds|key|passw(?:or)?d|secret|token)(?:[ \t\w.-]{0,20})[\s'"]{0,3}(?:=|>|:{1,3}=|\|\||:|=>|\?=|,)[\x60'"\s=]{0,5}([\w.=-]{10,150}|[a-z0-9][a-z0-9+/]{11,}={0,3})(?:[\x60'"\s;]|\\[nr]|$)''' # pylint: disable=line-too-long
)

# Matches base64-encoded fragments: at least 16 chars of base64 alphabet with optional padding,
# not adjacent to other base64 characters (to capture complete tokens).
_BASE64_FRAGMENT_RE = re.compile(
r'(?<![A-Za-z0-9+/])[A-Za-z0-9+/]{16,}={0,2}(?![A-Za-z0-9+/=])'
)


# Shannon-entropy cutoff in bits per character: redact_pod_spec_env masks an env value only when
# its entropy is strictly greater than this number.
_ENTROPY_THRESHOLD = 3.0


def _shannon_entropy(data: str) -> float:
"""
Calculate the Shannon entropy of a string (bits per character).
https://en.wiktionary.org/wiki/Shannon_entropy
"""
if not data:
return 0.0
inv_length = 1.0 / len(data)
entropy = 0.0
for count in collections.Counter(data).values():
freq = count * inv_length
entropy -= freq * math.log2(freq)
return entropy


def redact_pod_spec_env(pod_spec: Dict) -> Dict:
    """
    Return a deep copy of pod_spec with high-entropy env var values replaced by [MASKED].

    The input is never modified. Both a full pod manifest (containers under 'spec') and a
    bare pod spec (containers at the top level) are accepted, and both 'containers' and
    'initContainers' are processed.

    A literal 'value' is masked only when its Shannon entropy is strictly greater than
    _ENTROPY_THRESHOLD. Short, low-diversity values such as 'true' or 'false' therefore stay
    readable. Longer values with many distinct characters are masked, and that includes most
    URLs and file paths, not only credentials. The entropy of an n-character string is at
    most log2(n), so values of 8 characters or fewer can never exceed a threshold of 3.0 and
    are never masked.

    Entries that use 'valueFrom' (i.e. have no 'value' key) or whose 'value' is None are left
    untouched. Keys that are present but null ('spec', 'containers', 'initContainers', 'env')
    are treated the same as missing keys. Values that are neither str nor bytes are measured
    on their str() form, so a numeric-looking secret is still masked and no longer raises.
    """
    redacted_spec = copy.deepcopy(pod_spec)

    # Fall back to the top level when there is no (or a null) 'spec' wrapper.
    spec = redacted_spec.get('spec')
    if spec is None:
        spec = redacted_spec

    for container_list_key in ('containers', 'initContainers'):
        # dict.get's default only applies to missing keys; `or ()` also covers explicit nulls.
        for container in spec.get(container_list_key) or ():
            for env_entry in container.get('env') or ():
                value = env_entry.get('value')
                if value is None:
                    continue
                # Measure non-string scalars on their text form instead of crashing on len().
                measured = value if isinstance(value, (str, bytes)) else str(value)
                if _shannon_entropy(measured) > _ENTROPY_THRESHOLD:
                    env_entry['value'] = '[MASKED]'
    return redacted_spec


def redact_secrets(lines: Iterable[str]) -> Generator[str, None, None]:
    """
    Lazily yield every input line with secret-looking values replaced by [MASKED].

    Each line goes through two passes. First, every base64-looking fragment is decoded; if
    the decoded text contains a secret, the whole fragment becomes [MASKED], otherwise it is
    kept as-is. Second, key/value patterns matched by SECRET_REDACTION_RE in the line itself
    have their captured value replaced with [MASKED].
    """
    def mask_captured_value(match: re.Match) -> str:
        """Replace the captured secret (group 1) wherever it occurs in the matched text."""
        return match.group(0).replace(match.group(1), '[MASKED]')

    def mask_fragment_if_secret(match: re.Match) -> str:
        """Return [MASKED] when the fragment decodes to text holding a secret."""
        encoded = match.group(0)
        try:
            missing_padding = -len(encoded) % 4
            raw_bytes = base64.b64decode(encoded + '=' * missing_padding, validate=True)
            decoded_text = raw_bytes.decode('utf-8')
        except (ValueError, UnicodeDecodeError):
            # Not valid base64, or not UTF-8 text: leave the fragment alone.
            return encoded
        if SECRET_REDACTION_RE.sub(mask_captured_value, decoded_text) != decoded_text:
            return '[MASKED]'
        return encoded

    for raw_line in lines:
        without_encoded_secrets = _BASE64_FRAGMENT_RE.sub(mask_fragment_if_secret, raw_line)
        yield SECRET_REDACTION_RE.sub(mask_captured_value, without_encoded_secrets)
8 changes: 8 additions & 0 deletions src/lib/utils/tests/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -42,3 +42,11 @@ osmo_py_test(
"//src/lib/utils:jinja_sandbox",
]
)

# Runs test_redact_secrets.py; its only declared dependency is the redact library.
osmo_py_test(
name = "test_redact_secrets",
srcs = ["test_redact_secrets.py"],
deps = [
"//src/lib/utils:redact",
],
)
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,9 @@
import base64
import textwrap
import unittest
from typing import Any

from src.service.core.workflow.workflow_service import redact_secrets
from src.lib.utils.redact import redact_pod_spec_env, redact_secrets


# The AWS keys used below are the well-known example credentials from the AWS documentation
Expand Down Expand Up @@ -94,5 +95,52 @@ def test_leaves_safe_base64_untouched(self):
self.assertIn(encoded, redacted)


class TestRedactPodSpecEnv(unittest.TestCase):
    """Checks that redact_pod_spec_env masks only high-entropy literal env values."""

    def _make_pod_spec(self, *containers: Any) -> dict:
        """Build a bare pod spec holding the given containers and no init containers."""
        return {'containers': containers, 'initContainers': []}

    def test_masks_high_entropy_secret(self):
        secret_env = {'name': 'AWS_SECRET_ACCESS_KEY', 'value': _AWS_SECRET_KEY}
        pod_spec = self._make_pod_spec({'name': 'app', 'env': [secret_env]})

        first_env = redact_pod_spec_env(pod_spec)['containers'][0]['env'][0]

        self.assertEqual(first_env['value'], '[MASKED]')

    def test_preserves_low_entropy_value(self):
        flag_env = {'name': 'ENABLE_FEATURE', 'value': 'true'}
        pod_spec = self._make_pod_spec({'name': 'app', 'env': [flag_env]})

        first_env = redact_pod_spec_env(pod_spec)['containers'][0]['env'][0]

        self.assertEqual(first_env['value'], 'true')

    def test_does_not_modify_original(self):
        secret_env = {'name': 'AWS_SECRET_ACCESS_KEY', 'value': _AWS_SECRET_KEY}
        pod_spec = self._make_pod_spec({'name': 'app', 'env': [secret_env]})

        # The return value is deliberately ignored: only the input is inspected.
        redact_pod_spec_env(pod_spec)

        self.assertEqual(pod_spec['containers'][0]['env'][0]['value'], _AWS_SECRET_KEY)

    def test_masks_in_init_containers(self):
        init_container = {
            'name': 'init',
            'env': [
                {'name': 'AWS_SECRET_ACCESS_KEY', 'value': _AWS_SECRET_KEY},
            ],
        }
        pod_spec = {'containers': [], 'initContainers': [init_container]}

        first_env = redact_pod_spec_env(pod_spec)['initContainers'][0]['env'][0]

        self.assertEqual(first_env['value'], '[MASKED]')

    def test_leaves_value_from_untouched(self):
        value_from_env = {
            'name': 'MY_SECRET',
            'valueFrom': {'secretKeyRef': {'name': 'my-secret', 'key': 'value'}},
        }
        pod_spec = self._make_pod_spec({'name': 'app', 'env': [value_from_env]})

        first_env = redact_pod_spec_env(pod_spec)['containers'][0]['env'][0]

        self.assertNotIn('value', first_env)


if __name__ == '__main__':
unittest.main()
1 change: 1 addition & 0 deletions src/service/core/workflow/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ osmo_py_library(
"//src/lib/utils:login",
"//src/lib/utils:priority",
"//src/lib/utils:osmo_errors",
"//src/lib/utils:redact",
"//src/utils:static_config",
"//src/utils/job:job",
"//src/utils:yaml",
Expand Down
7 changes: 7 additions & 0 deletions src/service/core/workflow/objects.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
from src.lib.data import storage
from src.lib.data.storage.credentials import credentials as data_credentials
from src.lib.utils import credentials, common, osmo_errors, priority as wf_priority
from src.lib.utils.redact import redact_secrets
import src.lib.utils.logging
from src.utils.job import app, common as task_common, jobs, kb_objects, task, workflow
from src.utils import connectors, static_config, yaml as util_yaml
Expand Down Expand Up @@ -902,10 +903,16 @@ def convert_task_file_contents(curr_task_spec: Dict):
convert_task_file_contents(task_spec)

workflow_spec = yaml.dump(workflow_dict, default_flow_style=False, allow_unicode=True)

# Redact secrets in the workflow spec
workflow_spec = ''.join(redact_secrets((workflow_spec,)))

files = [
jobs.File(path=common.WORKFLOW_SPEC_FILE_NAME, content=workflow_spec)
]
if original_templated_spec is not None:
# Redact secrets in the original templated spec
original_templated_spec = ''.join(redact_secrets((original_templated_spec,)))
files.append(jobs.File(
path=common.TEMPLATED_WORKFLOW_SPEC_FILE_NAME,
content=original_templated_spec))
Expand Down
8 changes: 0 additions & 8 deletions src/service/core/workflow/tests/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -46,11 +46,3 @@ py_test(
],
)

# Runs test_redact_secrets.py against the //src/service/core/workflow library.
py_test(
name = "test_redact_secrets",
srcs = ["test_redact_secrets.py"],
deps = [
"//src/service/core/workflow",
],
)

47 changes: 2 additions & 45 deletions src/service/core/workflow/workflow_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
SPDX-License-Identifier: Apache-2.0
"""

import base64
import collections
import dataclasses
import datetime
Expand All @@ -26,7 +25,7 @@
import json
import logging
import re
from typing import Any, AsyncGenerator, Dict, Generator, Iterable, List, Optional
from typing import Any, AsyncGenerator, Dict, Generator, List, Optional
import urllib.parse
import yaml

Expand All @@ -36,6 +35,7 @@

from src.lib.data import storage
from src.lib.utils import common, credentials, login, osmo_errors, priority as wf_priority
from src.lib.utils.redact import redact_secrets
from src.utils.job import common as job_common, jobs, workflow, task
from src.service.core.workflow import helpers, objects
from src.utils import connectors
Expand All @@ -49,49 +49,6 @@

FETCH_TASK_LIMIT = 1000

# Regex to match secrets in the spec. While this is not a perfect solution, it solves the majority
# of cases.
# Regex from: https://lookingatcomputer.substack.com/p/regex-is-almost-all-you-need
# Proper secret management:
# https://nvidia.github.io/OSMO/main/user_guide/getting_started/credentials.html
# Capture group 1 is the secret value itself; redact_secrets replaces that text with [MASKED]
# and keeps the rest of the match (key name, separator, quotes).
SECRET_REDACTION_RE = re.compile(
r'''(?i)[\w.-]{0,50}?(?:access|auth|(?-i:[Aa]pi|API)|credential|creds|key|passw(?:or)?d|secret|token)(?:[ \t\w.-]{0,20})[\s'"]{0,3}(?:=|>|:{1,3}=|\|\||:|=>|\?=|,)[\x60'"\s=]{0,5}([\w.=-]{10,150}|[a-z0-9][a-z0-9+/]{11,}={0,3})(?:[\x60'"\s;]|\\[nr]|$)''' # pylint: disable=line-too-long
)

# Matches base64-encoded fragments: at least 16 chars of base64 alphabet with optional padding,
# not adjacent to other base64 characters (to capture complete tokens).
_BASE64_FRAGMENT_RE = re.compile(r'(?<![A-Za-z0-9+/])[A-Za-z0-9+/]{16,}={0,2}(?![A-Za-z0-9+/=])')


def redact_secrets(lines: Iterable[str]) -> Generator[str, None, None]:
    """
    Yield lines with secrets in the spec redacted.

    Base64-looking fragments are decoded first and replaced wholesale with [MASKED] when the
    decoded text contains a secret. After that, values captured by SECRET_REDACTION_RE in the
    line itself are replaced with [MASKED].
    """
    def mask_value(found: re.Match) -> str:
        """Swap the captured secret (group 1) for [MASKED] inside the matched text."""
        return found.group(0).replace(found.group(1), '[MASKED]')

    def mask_encoded(found: re.Match) -> str:
        """Mask a base64 fragment only when its decoded form holds a secret."""
        token = found.group(0)
        try:
            pad = '=' * (-len(token) % 4)
            plain = base64.b64decode(token + pad, validate=True).decode('utf-8')
        except (ValueError, UnicodeDecodeError):
            # Undecodable fragments are passed through unchanged.
            return token
        contains_secret = SECRET_REDACTION_RE.sub(mask_value, plain) != plain
        return '[MASKED]' if contains_secret else token

    for current in lines:
        decoded_pass = _BASE64_FRAGMENT_RE.sub(mask_encoded, current)
        yield SECRET_REDACTION_RE.sub(mask_value, decoded_pass)


class ActionType(enum.Enum):
EXEC = 'exec'
PORTFORWARD = 'portforward'
Expand Down
1 change: 1 addition & 0 deletions src/utils/job/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ osmo_py_library(
"//src/lib/utils:jinja_sandbox",
"//src/lib/utils:priority",
"//src/lib/utils:osmo_errors",
"//src/lib/utils:redact",
"//src/lib/utils:workflow",
"//src/utils:notify",
"//src/utils:yaml",
Expand Down
3 changes: 2 additions & 1 deletion src/utils/job/jobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@

from src.lib.data import storage
from src.lib.utils import common, osmo_errors, priority as wf_priority
from src.lib.utils.redact import redact_pod_spec_env
from src.utils import connectors
from src.utils.job import app, backend_job_defs, common as task_common, kb_objects, task, workflow
from src.utils.job.jobs_base import Job, JobResult, JobStatus, update_progress_writer
Expand Down Expand Up @@ -480,7 +481,7 @@ def prepare_execute(self, context: JobExecutionContext,
upload_task = UploadWorkflowFiles(
workflow_id=workflow_obj.workflow_id,
workflow_uuid=self.workflow_uuid,
files=[File(f'{task_name}.spec', yaml.dump(pod_spec))
files=[File(f'{task_name}.spec', yaml.dump(redact_pod_spec_env(pod_spec)))
for task_name, pod_spec in pod_specs.items()])
upload_task.send_job_to_queue()

Expand Down
Loading