Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions superset/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,21 @@ def _try_json_readsha(filepath: str, length: int) -> str | None:
SUPERSET_DASHBOARD_POSITION_DATA_LIMIT = 65535
CUSTOM_SECURITY_MANAGER = None
SQLALCHEMY_TRACK_MODIFICATIONS = False

# ---------------------------------------------------------
# FedRAMP Cryptographic Compliance
# ---------------------------------------------------------

# Hash algorithm used for non-cryptographic purposes (cache keys, thumbnails, etc.)
# Options: 'md5' (legacy), 'sha256'
#
# IMPORTANT: Changing this value will invalidate all existing cached content.
# Cache will re-warm naturally within 24-48 hours.
#
# For FedRAMP compliance, set to 'sha256'
# For backward compatibility with existing deployments, keep as 'md5'
#
# The hashing helpers read this value at runtime from the Flask app config
# under the key "HASH_ALGORITHM".
HASH_ALGORITHM: Literal["md5", "sha256"] = "md5"

# ---------------------------------------------------------

# Your App secret key. Make sure you override it on superset_config.py
Expand Down
2 changes: 1 addition & 1 deletion superset/extensions/metastore_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def factory(
cls, app: Flask, config: dict[str, Any], args: list[Any], kwargs: dict[str, Any]
) -> BaseCache:
seed = config.get("CACHE_KEY_PREFIX", "")
kwargs["namespace"] = get_uuid_namespace(seed)
kwargs["namespace"] = get_uuid_namespace(seed, app)
codec = config.get("CODEC") or PickleKeyValueCodec()
if (
has_app_context()
Expand Down
33 changes: 29 additions & 4 deletions superset/key_value/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
# under the License.
from __future__ import annotations

import hashlib
from hashlib import md5
from secrets import token_urlsafe
from typing import Any
Expand Down Expand Up @@ -66,10 +67,34 @@ def decode_permalink_id(key: str, salt: str) -> int:
raise KeyValueParseKeyError(_("Invalid permalink key"))


def get_uuid_namespace(seed: str, app: Any = None) -> UUID:
    """
    Generate a UUID namespace from a seed string using the configured hash algorithm.

    Args:
        seed: Seed string for namespace generation
        app: Flask app instance (optional, uses current_app if not provided)

    Returns:
        UUID built from the digest of the UTF-8 encoded seed. With 'sha256' the
        first 16 bytes of the digest are used; any other configured value takes
        the legacy MD5 path.
    """
    if app is None:
        # Local import: only needed when the caller did not pass an app.
        from flask import current_app

        app = current_app

    # Default to the legacy algorithm when the key is absent, so that an app
    # config without HASH_ALGORITHM keeps producing the historical namespaces.
    algorithm = app.config.get("HASH_ALGORITHM", "md5")
    seed_bytes = seed.encode("utf-8")

    if algorithm == "sha256":
        # A UUID holds 128 bits, so only the first 16 bytes of the digest fit.
        return UUID(bytes=hashlib.sha256(seed_bytes).digest()[:16])

    # Legacy MD5 path for backward compatibility.
    # usedforsecurity=False declares the non-cryptographic intent, which keeps
    # the call usable on FIPS-restricted Python builds.
    return UUID(md5(seed_bytes, usedforsecurity=False).hexdigest())


def get_deterministic_uuid(namespace: str, payload: Any) -> UUID:
Expand Down
93 changes: 88 additions & 5 deletions superset/utils/hashing.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,23 +14,106 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from __future__ import annotations

import hashlib
from typing import Any, Callable, Optional
import logging
from typing import Any, Callable, Literal, Optional

from flask import current_app as app

from superset.utils import json

logger = logging.getLogger(__name__)

def md5_sha_from_str(val: str) -> str:
    """Return the hexadecimal MD5 digest of ``val`` encoded as UTF-8."""
    # usedforsecurity=False declares the non-cryptographic intent, which keeps
    # the call usable on FIPS-restricted Python builds.
    return hashlib.md5(val.encode("utf-8"), usedforsecurity=False).hexdigest()
HashAlgorithm = Literal["md5", "sha256"]


def md5_sha_from_dict(
def get_hash_algorithm() -> HashAlgorithm:
    """
    Get the configured hash algorithm for non-cryptographic purposes.

    Returns:
        Value of the ``HASH_ALGORITHM`` entry of the current Flask app config
        ('md5' or 'sha256'); 'md5' when the entry is not set.
    """
    # Fall back to the legacy algorithm instead of raising KeyError when the
    # app config does not define HASH_ALGORITHM.
    return app.config.get("HASH_ALGORITHM", "md5")


def hash_from_str(val: str, algorithm: Optional[HashAlgorithm] = None) -> str:
    """
    Generate a hash from a string using the configured or specified algorithm.

    Args:
        val: String to hash
        algorithm: Hash algorithm to use (defaults to configured algorithm)

    Returns:
        Hexadecimal hash digest string

    Raises:
        ValueError: If the algorithm is neither 'sha256' nor 'md5'

    Examples:
        >>> hash_from_str("test", algorithm="sha256")  # Force SHA-256
        '9f86d081884c7d659a2feaa0c55ad015a3bf4f1b2b0b822cd15d6c15b0f00a08'
        >>> hash_from_str("test", algorithm="md5")  # Force MD5
        '098f6bcd4621d373cade4e832627b4f6'
    """
    if algorithm is None:
        algorithm = get_hash_algorithm()

    encoded = val.encode("utf-8")

    if algorithm == "sha256":
        return hashlib.sha256(encoded).hexdigest()

    if algorithm == "md5":
        # MD5 is only acceptable for legacy compatibility. Code scanning
        # (CodeQL) reports this call as weak hashing when callers pass
        # sensitive values, so it must not be relied on for security.
        # usedforsecurity=False declares the non-cryptographic intent, which
        # keeps the call usable on FIPS-restricted Python builds.
        return hashlib.md5(encoded, usedforsecurity=False).hexdigest()

    raise ValueError(f"Unsupported hash algorithm: {algorithm}")


def hash_from_dict(
    obj: dict[Any, Any],
    ignore_nan: bool = False,
    default: Optional[Callable[[Any], Any]] = None,
    algorithm: Optional[HashAlgorithm] = None,
) -> str:
    """
    Generate a hash from a dictionary using the configured or specified algorithm.

    The dictionary is serialized to JSON and the resulting string is hashed
    with ``hash_from_str``.

    Args:
        obj: Dictionary to hash
        ignore_nan: Whether to ignore NaN values in JSON serialization
        default: Default function for JSON serialization
        algorithm: Hash algorithm to use (defaults to configured algorithm)

    Returns:
        Hexadecimal hash digest string
    """
    # Keys are sorted so that equal dictionaries are expected to serialize to
    # the same string regardless of insertion order.
    json_data = json.dumps(
        obj, sort_keys=True, ignore_nan=ignore_nan, default=default, allow_nan=True
    )

    return hash_from_str(json_data, algorithm=algorithm)


# Backward compatibility aliases
# These maintain the old function names but use the new generic implementation
def md5_sha_from_str(val: str) -> str:
    """
    Legacy function name for backward compatibility.

    DEPRECATED: Use hash_from_str() instead.

    This function now uses the configured hash algorithm (not always MD5).

    Args:
        val: String to hash

    Returns:
        Hexadecimal hash digest string, as returned by hash_from_str(val)
    """
    return hash_from_str(val)


def md5_sha_from_dict(
    obj: dict[Any, Any],
    ignore_nan: bool = False,
    default: Optional[Callable[[Any], Any]] = None,
) -> str:
    """
    Legacy function name for backward compatibility.

    DEPRECATED: Use hash_from_dict() instead.

    This function now uses the configured hash algorithm (not always MD5).

    Args:
        obj: Dictionary to hash
        ignore_nan: Whether to ignore NaN values in JSON serialization
        default: Default function for JSON serialization

    Returns:
        Hexadecimal hash digest string, as returned by hash_from_dict()
    """
    return hash_from_dict(obj, ignore_nan=ignore_nan, default=default)
6 changes: 3 additions & 3 deletions tests/integration_tests/utils/core_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from superset.utils.core import form_data_to_adhoc, simple_filter_to_adhoc


def test_simple_filter_to_adhoc_generates_deterministic_values():
def test_simple_filter_to_adhoc_generates_deterministic_values(app_context):
input_1 = {
"op": "IS NOT NULL",
"col": "LATITUDE",
Expand Down Expand Up @@ -51,7 +51,7 @@ def test_simple_filter_to_adhoc_generates_deterministic_values():
}


def test_form_data_to_adhoc_generates_deterministic_values():
def test_form_data_to_adhoc_generates_deterministic_values(app_context):
form_data = {"where": "1 = 1", "having": "count(*) > 1"}

# The result is the same when given the same input
Expand All @@ -77,7 +77,7 @@ def test_form_data_to_adhoc_generates_deterministic_values():
}


def test_form_data_to_adhoc_incorrect_clause_type():
def test_form_data_to_adhoc_incorrect_clause_type(app_context):
form_data = {"where": "1 = 1", "having": "count(*) > 1"}

with pytest.raises(ValueError): # noqa: PT011
Expand Down
Loading
Loading