-
Notifications
You must be signed in to change notification settings - Fork 28
feat(malware-check): add whitespace check to detect excessive spacing and invisible characters #1086
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
feat(malware-check): add whitespace check to detect excessive spacing and invisible characters #1086
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -181,3 +181,4 @@ docs/_build | |
bin/ | ||
requirements.txt | ||
.macaron_env_file | ||
.DS_Store |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,103 @@ | ||
# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. | ||
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. | ||
|
||
"""This analyzer checks if the package has white spaces or invisible characters in the code.""" | ||
|
||
import logging | ||
import re | ||
|
||
from macaron.config.defaults import defaults | ||
from macaron.json_tools import JsonType | ||
from macaron.malware_analyzer.pypi_heuristics.base_analyzer import BaseHeuristicAnalyzer | ||
from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult, Heuristics | ||
from macaron.slsa_analyzer.package_registry.pypi_registry import PyPIPackageJsonAsset | ||
|
||
logger: logging.Logger = logging.getLogger(__name__) | ||
|
||
|
||
class WhiteSpacesAnalyzer(BaseHeuristicAnalyzer):
    """Check whether the code has successive white spaces or invisible characters.

    A ``.py`` file is flagged when it contains an uninterrupted run of at least
    ``repeated_spaces_threshold`` characters drawn from ``INVISIBLE_CHARS``.
    """

    # Characters that count towards a suspicious run. They are placed inside a
    # regex character class, so a run may freely mix any of them.
    INVISIBLE_CHARS = [
        "\u200b",  # zero width space
        "\u200c",  # zero width non-joiner
        "\u200d",  # zero width joiner
        "\ufeff",  # zero width no-break space (byte order mark)
        "\u200e",  # left-to-right mark
        "\u200f",  # right-to-left mark
        "\u00a0",  # no-break space
        "\u00ad",  # soft hyphen
        " ",  # regular space
    ]

    def __init__(self) -> None:
        super().__init__(
            name="white_spaces_analyzer",
            heuristic=Heuristics.WHITE_SPACES,
            depends_on=None,
        )

        self.repeated_spaces_threshold = self._load_defaults()

    def _load_defaults(self) -> int:
        """Load default settings from defaults.ini.

        The ``repeated_spaces_threshold`` option of the ``heuristic.pypi`` section is
        used when it is a positive ASCII decimal integer. Any other value (missing,
        empty, non-numeric, zero) falls back to the built-in default.

        Returns
        -------
        int:
            The repeated spaces threshold.
        """
        section_name = "heuristic.pypi"
        default_threshold = 50

        if not defaults.has_section(section_name):
            return default_threshold

        value_str = defaults[section_name].get("repeated_spaces_threshold", fallback=str(default_threshold))

        # str.isdigit() alone also accepts characters such as superscript digits that
        # int() rejects with ValueError, so restrict the check to ASCII first.
        if value_str and value_str.isascii() and value_str.isdigit():
            threshold = int(value_str)
            # A threshold of 0 would produce the quantifier "{0,}", which matches the
            # empty string and would therefore flag every file.
            if threshold > 0:
                return threshold

        logger.warning(
            "Invalid repeated_spaces_threshold %r in section [%s]; using default %d.",
            value_str,
            section_name,
            default_threshold,
        )
        return default_threshold

    def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicResult, dict[str, JsonType]]:
        """Analyze the package.

        Only entries whose name ends with ``.py`` are inspected, and the analysis stops
        at the first file that contains a suspicious run.

        Parameters
        ----------
        pypi_package_json: PyPIPackageJsonAsset
            The PyPI package JSON asset object.

        Returns
        -------
        tuple[HeuristicResult, dict[str, JsonType]]:
            The result and related information collected during the analysis.
            ``SKIP`` with no details when no source code is available, ``FAIL`` with the
            offending file name under the ``"file"`` key, or ``PASS`` with no details.
        """
        scripts: dict[str, str] | None = pypi_package_json.get_sourcecode()
        if scripts is None:
            return HeuristicResult.SKIP, {}

        for file, content in scripts.items():
            if file.endswith(".py") and self.has_white_spaces(content):
                return HeuristicResult.FAIL, {
                    "file": file,
                }
        return HeuristicResult.PASS, {}

    def has_white_spaces(self, code_string: str) -> bool:
        """Check for excessive or invisible whitespace characters in a code string.

        Parameters
        ----------
        code_string: str
            The code string to check.

        Returns
        -------
        bool:
            True if suspicious patterns are found, False otherwise.
        """
        # Escape every character so that adding a regex metacharacter such as "]", "^"
        # or "-" to INVISIBLE_CHARS cannot change the meaning of the character class.
        char_class = "".join(re.escape(char) for char in self.INVISIBLE_CHARS)
        regex_pattern = f"[{char_class}]{{{self.repeated_spaces_threshold},}}"
        # No flags are needed: the pattern contains no "." for re.DOTALL to affect.
        return re.search(regex_pattern, code_string) is not None
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -27,6 +27,7 @@ | |
from macaron.malware_analyzer.pypi_heuristics.metadata.wheel_absence import WheelAbsenceAnalyzer | ||
from macaron.malware_analyzer.pypi_heuristics.pypi_sourcecode_analyzer import PyPISourcecodeAnalyzer | ||
from macaron.malware_analyzer.pypi_heuristics.sourcecode.suspicious_setup import SuspiciousSetupAnalyzer | ||
from macaron.malware_analyzer.pypi_heuristics.sourcecode.white_spaces import WhiteSpacesAnalyzer | ||
from macaron.slsa_analyzer.analyze_context import AnalyzeContext | ||
from macaron.slsa_analyzer.checks.base_check import BaseCheck | ||
from macaron.slsa_analyzer.checks.check_result import CheckResultData, CheckResultType, Confidence, JustificationType | ||
|
@@ -332,6 +333,7 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData: | |
SuspiciousSetupAnalyzer, | ||
WheelAbsenceAnalyzer, | ||
AnomalousVersionAnalyzer, | ||
WhiteSpacesAnalyzer, | ||
] | ||
|
||
# name used to query the result of all problog rules, so it can be accessed outside the model. | ||
|
@@ -381,6 +383,10 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData: | |
failed({Heuristics.CLOSER_RELEASE_JOIN_DATE.value}), | ||
forceSetup. | ||
|
||
% Package released with excessive whitespace in the code . | ||
{Confidence.HIGH.value}::trigger(malware_high_confidence_4) :- | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could you walk me through the rationale of why we should combine `forceSetup` with the whitespace heuristic?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Because it implies not only that harmful code may be executed during the installation process, but also that there appears to be an effort to hide the form of this code, which strongly suggests malicious motives.
||
forceSetup, failed({Heuristics.WHITE_SPACES.value}). | ||
|
||
% Package released recently with little detail, with multiple releases as a trust marker, but frequent and with | ||
% the same code. | ||
{Confidence.MEDIUM.value}::trigger(malware_medium_confidence_1) :- | ||
|
@@ -401,6 +407,7 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData: | |
{problog_result_access} :- trigger(malware_high_confidence_1). | ||
{problog_result_access} :- trigger(malware_high_confidence_2). | ||
{problog_result_access} :- trigger(malware_high_confidence_3). | ||
{problog_result_access} :- trigger(malware_high_confidence_4). | ||
{problog_result_access} :- trigger(malware_medium_confidence_2). | ||
{problog_result_access} :- trigger(malware_medium_confidence_1). | ||
query({problog_result_access}). | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. | ||
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. | ||
|
||
"""Tests for the WhiteSpacesAnalyzer heuristic.""" | ||
# pylint: disable=redefined-outer-name | ||
|
||
|
||
from unittest.mock import MagicMock | ||
|
||
import pytest | ||
|
||
from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult | ||
from macaron.malware_analyzer.pypi_heuristics.sourcecode.white_spaces import WhiteSpacesAnalyzer | ||
|
||
|
||
@pytest.fixture()
def analyzer() -> WhiteSpacesAnalyzer:
    """Provide a newly constructed WhiteSpacesAnalyzer to each test."""
    return WhiteSpacesAnalyzer()
|
||
|
||
def test_analyze_no_sourcecode(analyzer: WhiteSpacesAnalyzer, pypi_package_json: MagicMock) -> None:
    """Verify that analysis is skipped, with no details, when no source code is available."""
    # Simulate a package whose source code could not be obtained.
    pypi_package_json.get_sourcecode.return_value = None

    outcome, details = analyzer.analyze(pypi_package_json)

    assert details == {}
    assert outcome == HeuristicResult.SKIP
|
||
|
||
def test_analyze_pass(analyzer: WhiteSpacesAnalyzer, pypi_package_json: MagicMock) -> None:
    """Verify that ordinary code without suspicious whitespace passes with no details."""
    clean_sources = {"test.py": "print('hello')"}
    pypi_package_json.get_sourcecode.return_value = clean_sources

    outcome, details = analyzer.analyze(pypi_package_json)

    assert details == {}
    assert outcome == HeuristicResult.PASS
|
||
|
||
def test_analyze_fail_long_spaces(analyzer: WhiteSpacesAnalyzer, pypi_package_json: MagicMock) -> None:
    """Verify that a run of spaces one longer than the threshold makes the analysis fail."""
    # One more space than the configured threshold.
    padding = " " * (analyzer.repeated_spaces_threshold + 1)
    source = f"print('hello')\n{padding}print('world')"
    pypi_package_json.get_sourcecode.return_value = {"test.py": source}

    outcome, details = analyzer.analyze(pypi_package_json)

    assert outcome == HeuristicResult.FAIL
    assert details["file"] == "test.py"
|
||
|
||
def test_analyze_fail_invisible_chars(analyzer: WhiteSpacesAnalyzer, pypi_package_json: MagicMock) -> None:
    """Verify that a run of zero-width spaces exactly at the threshold makes the analysis fail."""
    zero_width_space = "\u200b"
    hidden_run = zero_width_space * analyzer.repeated_spaces_threshold
    source = f"print('hello'){hidden_run}print('world')"
    pypi_package_json.get_sourcecode.return_value = {"test.py": source}

    outcome, details = analyzer.analyze(pypi_package_json)

    assert outcome == HeuristicResult.FAIL
    assert details["file"] == "test.py"
|
||
|
||
def test_has_white_spaces_long_spaces(analyzer: WhiteSpacesAnalyzer) -> None:
    """Verify that has_white_spaces reports a run of spaces exactly at the threshold."""
    padding = " " * analyzer.repeated_spaces_threshold
    source = f"print('hello')\n{padding}print('world')"
    assert analyzer.has_white_spaces(source)
|
||
|
||
def test_has_white_spaces_no_suspicious(analyzer: WhiteSpacesAnalyzer) -> None:
    """Verify that has_white_spaces reports nothing for ordinary two-line code."""
    source = "print('hello')\nprint('world')"
    flagged = analyzer.has_white_spaces(source)
    assert not flagged
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Small typo, "The" instead of "THe"