Skip to content

feat(malware-check): add whitespace check to detect excessive spacing and invisible characters #1086

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -181,3 +181,4 @@ docs/_build
bin/
requirements.txt
.macaron_env_file
.DS_Store
2 changes: 2 additions & 0 deletions src/macaron/config/defaults.ini
Original file line number Diff line number Diff line change
Expand Up @@ -600,3 +600,5 @@ major_threshold = 20
epoch_threshold = 3
# The number of days +/- the day of publish the calendar versioning day may be.
day_publish_error = 4
# The threshold for the number of repeated spaces in a line from the source code.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Small typo, "The" instead of "THe"

repeated_spaces_threshold =
3 changes: 3 additions & 0 deletions src/macaron/malware_analyzer/pypi_heuristics/heuristics.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,9 @@ class Heuristics(str, Enum):
#: Indicates that the package has an unusually large version number for a single release.
ANOMALOUS_VERSION = "anomalous_version"

#: Indicates that the package has a lot of white spaces or invisible characters in the code.
WHITE_SPACES = "white_spaces"


class HeuristicResult(str, Enum):
"""Result type indicating the outcome of a heuristic."""
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.

"""This analyzer checks if the package has white spaces or invisible characters in the code."""

import logging
import re

from macaron.config.defaults import defaults
from macaron.json_tools import JsonType
from macaron.malware_analyzer.pypi_heuristics.base_analyzer import BaseHeuristicAnalyzer
from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult, Heuristics
from macaron.slsa_analyzer.package_registry.pypi_registry import PyPIPackageJsonAsset

logger: logging.Logger = logging.getLogger(__name__)


class WhiteSpacesAnalyzer(BaseHeuristicAnalyzer):
    """Check whether the code has successive white spaces or invisible characters.

    A ``.py`` file is flagged when it contains a run of at least
    ``repeated_spaces_threshold`` consecutive characters drawn from
    ``INVISIBLE_CHARS``. The characters in a run may be mixed.
    """

    #: Characters that count towards a suspicious run.
    INVISIBLE_CHARS = [
        "\u200b",  # zero width space
        "\u200c",  # zero width non-joiner
        "\u200d",  # zero width joiner
        "\ufeff",  # zero width no-break space (byte order mark)
        "\u200e",  # left-to-right mark
        "\u200f",  # right-to-left mark
        "\u00a0",  # no-break space
        "\u00ad",  # soft hyphen
        " ",  # regular space
    ]

    def __init__(self) -> None:
        super().__init__(
            name="white_spaces_analyzer",
            heuristic=Heuristics.WHITE_SPACES,
            depends_on=None,
        )

        self.repeated_spaces_threshold = self._load_defaults()

    def _load_defaults(self) -> int:
        """Load default settings from defaults.ini.

        The value of ``repeated_spaces_threshold`` in the ``heuristic.pypi`` section is
        used only when it is a strictly positive integer written with ASCII digits.
        A missing section, a missing or empty value, or an invalid value all fall back
        to the built-in default of 50.

        Returns
        -------
        int:
            The repeated spaces threshold.
        """
        section_name = "heuristic.pypi"
        option_name = "repeated_spaces_threshold"
        default_threshold = 50

        if not defaults.has_section(section_name):
            return default_threshold

        raw_value = defaults[section_name].get(option_name, fallback=str(default_threshold))
        value_str = raw_value.strip() if raw_value is not None else ""

        # str.isdigit() alone also accepts characters such as "²" that int() rejects,
        # so restrict the check to ASCII digits before converting.
        if value_str.isascii() and value_str.isdigit():
            threshold = int(value_str)
            # A threshold of 0 would make the "{0,}" quantifier match the empty string,
            # flagging every file, so only strictly positive values are accepted.
            if threshold > 0:
                return threshold

        if value_str:
            logger.warning(
                "Invalid value %r for %s in section [%s]; using default %d.",
                value_str,
                option_name,
                section_name,
                default_threshold,
            )
        return default_threshold

    def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicResult, dict[str, JsonType]]:
        """Analyze the package.

        Only entries whose name ends with ``.py`` are inspected, and the analysis stops
        at the first file that contains a suspicious run.

        Parameters
        ----------
        pypi_package_json: PyPIPackageJsonAsset
            The PyPI package JSON asset object.

        Returns
        -------
        tuple[HeuristicResult, dict[str, JsonType]]:
            The result and related information collected during the analysis.
            ``SKIP`` with no details when no source code is available, ``FAIL`` with the
            offending file name under the ``"file"`` key, otherwise ``PASS`` with no details.
        """
        scripts: dict[str, str] | None = pypi_package_json.get_sourcecode()
        if scripts is None:
            return HeuristicResult.SKIP, {}

        for file, content in scripts.items():
            if file.endswith(".py") and self.has_white_spaces(content):
                return HeuristicResult.FAIL, {
                    "file": file,
                }
        return HeuristicResult.PASS, {}

    def has_white_spaces(self, code_string: str) -> bool:
        """Check for excessive or invisible whitespace characters in a code string.

        Parameters
        ----------
        code_string: str
            The code string to check.

        Returns
        -------
        bool:
            True if suspicious patterns are found, False otherwise.
        """
        # Escape every character so the class stays valid even if a regex
        # metacharacter (e.g. "]" or "^") is later added to INVISIBLE_CHARS.
        char_class = "".join(re.escape(char) for char in self.INVISIBLE_CHARS)
        regex_pattern = f"[{char_class}]{{{self.repeated_spaces_threshold},}}"
        # No flags are needed: the pattern contains no "." for re.DOTALL to affect.
        return re.search(regex_pattern, code_string) is not None
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
from macaron.malware_analyzer.pypi_heuristics.metadata.wheel_absence import WheelAbsenceAnalyzer
from macaron.malware_analyzer.pypi_heuristics.pypi_sourcecode_analyzer import PyPISourcecodeAnalyzer
from macaron.malware_analyzer.pypi_heuristics.sourcecode.suspicious_setup import SuspiciousSetupAnalyzer
from macaron.malware_analyzer.pypi_heuristics.sourcecode.white_spaces import WhiteSpacesAnalyzer
from macaron.slsa_analyzer.analyze_context import AnalyzeContext
from macaron.slsa_analyzer.checks.base_check import BaseCheck
from macaron.slsa_analyzer.checks.check_result import CheckResultData, CheckResultType, Confidence, JustificationType
Expand Down Expand Up @@ -332,6 +333,7 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData:
SuspiciousSetupAnalyzer,
WheelAbsenceAnalyzer,
AnomalousVersionAnalyzer,
WhiteSpacesAnalyzer,
]

# name used to query the result of all problog rules, so it can be accessed outside the model.
Expand Down Expand Up @@ -381,6 +383,10 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData:
failed({Heuristics.CLOSER_RELEASE_JOIN_DATE.value}),
forceSetup.

% Package released with excessive whitespace in the code .
{Confidence.HIGH.value}::trigger(malware_high_confidence_4) :-
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you walk me through the rationale of why we should combine WHITE_SPACES failing with the forceSetup rule and why these rules together are a malicious indicator with HIGH confidence?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Because it implies not only that harmful code may be executed during the installation process, but also that there appears to be an effort to hide that code, which strongly suggests malicious motives.

forceSetup, failed({Heuristics.WHITE_SPACES.value}).

% Package released recently with little detail, with multiple releases as a trust marker, but frequent and with
% the same code.
{Confidence.MEDIUM.value}::trigger(malware_medium_confidence_1) :-
Expand All @@ -401,6 +407,7 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData:
{problog_result_access} :- trigger(malware_high_confidence_1).
{problog_result_access} :- trigger(malware_high_confidence_2).
{problog_result_access} :- trigger(malware_high_confidence_3).
{problog_result_access} :- trigger(malware_high_confidence_4).
{problog_result_access} :- trigger(malware_medium_confidence_2).
{problog_result_access} :- trigger(malware_medium_confidence_1).
query({problog_result_access}).
Expand Down
70 changes: 70 additions & 0 deletions tests/malware_analyzer/pypi/test_white_spaces.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.

"""Tests for the WhiteSpacesAnalyzer heuristic."""
# pylint: disable=redefined-outer-name


from unittest.mock import MagicMock

import pytest

from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult
from macaron.malware_analyzer.pypi_heuristics.sourcecode.white_spaces import WhiteSpacesAnalyzer


@pytest.fixture()
def analyzer() -> WhiteSpacesAnalyzer:
    """Provide a newly constructed WhiteSpacesAnalyzer to each test."""
    return WhiteSpacesAnalyzer()


def test_analyze_no_sourcecode(analyzer: WhiteSpacesAnalyzer, pypi_package_json: MagicMock) -> None:
    """A package without source code yields SKIP and an empty detail dict."""
    # Simulate a package whose source code could not be obtained.
    pypi_package_json.get_sourcecode.return_value = None

    verdict, details = analyzer.analyze(pypi_package_json)

    assert verdict == HeuristicResult.SKIP
    assert details == {}


def test_analyze_pass(analyzer: WhiteSpacesAnalyzer, pypi_package_json: MagicMock) -> None:
    """Source code without suspicious whitespace yields PASS and an empty detail dict."""
    clean_sources = {"test.py": "print('hello')"}
    pypi_package_json.get_sourcecode.return_value = clean_sources

    verdict, details = analyzer.analyze(pypi_package_json)

    assert verdict == HeuristicResult.PASS
    assert details == {}


def test_analyze_fail_long_spaces(analyzer: WhiteSpacesAnalyzer, pypi_package_json: MagicMock) -> None:
    """A run of spaces longer than the threshold yields FAIL and names the file."""
    # One more space than the configured threshold.
    padding = " " * (analyzer.repeated_spaces_threshold + 1)
    source = f"print('hello')\n{padding}print('world')"
    pypi_package_json.get_sourcecode.return_value = {"test.py": source}

    verdict, details = analyzer.analyze(pypi_package_json)

    assert verdict == HeuristicResult.FAIL
    assert details["file"] == "test.py"


def test_analyze_fail_invisible_chars(analyzer: WhiteSpacesAnalyzer, pypi_package_json: MagicMock) -> None:
    """A run of zero-width spaces as long as the threshold yields FAIL and names the file."""
    zero_width_space = "\u200b"
    hidden_run = zero_width_space * analyzer.repeated_spaces_threshold
    source = f"print('hello'){hidden_run}print('world')"
    pypi_package_json.get_sourcecode.return_value = {"test.py": source}

    verdict, details = analyzer.analyze(pypi_package_json)

    assert verdict == HeuristicResult.FAIL
    assert details["file"] == "test.py"


def test_has_white_spaces_long_spaces(analyzer: WhiteSpacesAnalyzer) -> None:
    """has_white_spaces reports a run of spaces exactly as long as the threshold."""
    padding = " " * analyzer.repeated_spaces_threshold
    source = f"print('hello')\n{padding}print('world')"

    assert analyzer.has_white_spaces(source)


def test_has_white_spaces_no_suspicious(analyzer: WhiteSpacesAnalyzer) -> None:
    """has_white_spaces reports nothing for ordinary code."""
    source = "print('hello')\nprint('world')"
    flagged = analyzer.has_white_spaces(source)

    assert not flagged
Loading