Multiprocessing refactoring (#713)

babenek · web-flow · commit a7051b76666a · 2025-05-12T11:56:09.000+03:00
* migration to ProcessPoolExecutor

* style+fix

* lazyload InferenceSession

* log info for scan and test

* Removed extra

* custom BM

* [skip actions] [mlval] 2025-05-06T15:11:31+03:00

* rollback to multiprocessing

* test fix

* style

* small fixes

* Linter fix

* startswith a tuple

* Simplified

* add works faster

* doc update

* logging optimization

* test fix

* style

* type annotations
diff --git a/.flake8 b/.flake8
@@ -1,4 +1,4 @@
 [flake8]
 max-line-length = 120
-extend-ignore = E203,E303,E131,E402
+extend-ignore = E402
 per-file-ignores = __init__.py:F401
diff --git a/credsweeper/app.py b/credsweeper/app.py
@@ -15,11 +15,13 @@
 from credsweeper.config import Config
 from credsweeper.credentials import Candidate, CredentialManager, CandidateKey
 from credsweeper.deep_scanner.deep_scanner import DeepScanner
+from credsweeper.file_handler.content_provider import ContentProvider
 from credsweeper.file_handler.diff_content_provider import DiffContentProvider
 from credsweeper.file_handler.file_path_extractor import FilePathExtractor
 from credsweeper.file_handler.abstract_provider import AbstractProvider
 from credsweeper.file_handler.text_content_provider import TextContentProvider
 from credsweeper.scanner import Scanner
+from credsweeper.ml_model.ml_validator import MlValidator
 from credsweeper.utils import Util
 
 logger = logging.getLogger(__name__)
@@ -94,7 +96,7 @@ def __init__(self,
             log_level: str - level for pool initializer according logging levels (UPPERCASE)
 
         """
-        self.pool_count: int = int(pool_count) if int(pool_count) > 1 else 1
+        self.pool_count: int = max(1, int(pool_count))
         if not (_severity := Severity.get(severity)):
             raise RuntimeError(f"Severity level provided: {severity}"
                                f" -- must be one of: {' | '.join([i.value for i in Severity])}")
@@ -123,9 +125,9 @@ def __init__(self,
         self.ml_config = ml_config
         self.ml_model = ml_model
         self.ml_providers = ml_providers
-        self.ml_validator = None
         self.__thrifty = thrifty
         self.__log_level = log_level
+        self.__ml_validator: Optional[MlValidator] = None
 
     # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
 
@@ -182,35 +184,22 @@ def _use_ml_validation(self) -> bool:
 
     # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
 
-    # the import cannot be done on top due
-    # TypeError: cannot pickle 'onnxruntime.capi.onnxruntime_pybind11_state.InferenceSession' object
-    from credsweeper.ml_model import MlValidator
-
-    # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
-
     @property
     def ml_validator(self) -> MlValidator:
         """ml_validator getter"""
-        from credsweeper.ml_model import MlValidator
         if not self.__ml_validator:
-            self.__ml_validator: MlValidator = MlValidator(
+            self.__ml_validator = MlValidator(
                 threshold=self.ml_threshold,  #
                 ml_config=self.ml_config,  #
                 ml_model=self.ml_model,  #
                 ml_providers=self.ml_providers,  #
             )
-        assert self.__ml_validator, "self.__ml_validator was not initialized"
+        if not self.__ml_validator:
+            raise RuntimeError("MlValidator was not initialized!")
         return self.__ml_validator
 
     # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
 
-    @ml_validator.setter
-    def ml_validator(self, _ml_validator: Optional[MlValidator]) -> None:
-        """ml_validator setter"""
-        self.__ml_validator = _ml_validator
-
-    # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
-
     @staticmethod
     def pool_initializer(log_kwargs) -> None:
         """Ignore SIGINT in child processes."""
@@ -219,20 +208,6 @@ def pool_initializer(log_kwargs) -> None:
 
     # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
 
-    @property
-    def config(self) -> Config:
-        """config getter"""
-        return self.__config
-
-    # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
-
-    @config.setter
-    def config(self, config: Config) -> None:
-        """config setter"""
-        self.__config = config
-
-    # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
-
     def run(self, content_provider: AbstractProvider) -> int:
         """Run an analysis of 'content_provider' object.
 
@@ -241,9 +216,10 @@ def run(self, content_provider: AbstractProvider) -> int:
 
         """
         _empty_list: Sequence[Union[DiffContentProvider, TextContentProvider]] = []
-        file_extractors: Sequence[Union[DiffContentProvider, TextContentProvider]] = \
-            content_provider.get_scannable_files(self.config) if content_provider else _empty_list
-        logger.info(f"Start Scanner for {len(file_extractors)} providers")
+        file_extractors = content_provider.get_scannable_files(self.config) if content_provider else _empty_list
+        if not file_extractors:
+            logger.info(f"No scannable targets for {len(content_provider.paths)} paths")
+            return 0
         self.scan(file_extractors)
         self.post_processing()
         # PatchesProvider has the attribute. Circular import error appears with using the isinstance
@@ -260,7 +236,7 @@ def scan(self, content_providers: Sequence[Union[DiffContentProvider, TextConten
             content_providers: file objects to scan
 
         """
-        if 1 < self.pool_count:
+        if 1 < self.pool_count and 1 < len(content_providers):
             self.__multi_jobs_scan(content_providers)
         else:
             self.__single_job_scan(content_providers)
@@ -269,6 +245,7 @@ def scan(self, content_providers: Sequence[Union[DiffContentProvider, TextConten
 
     def __single_job_scan(self, content_providers: Sequence[Union[DiffContentProvider, TextContentProvider]]) -> None:
         """Performs scan in main thread"""
+        logger.info(f"Scan for {len(content_providers)} providers")
         all_cred = self.files_scan(content_providers)
         self.credential_manager.set_credentials(all_cred)
 
@@ -284,12 +261,14 @@ def __multi_jobs_scan(self, content_providers: Sequence[Union[DiffContentProvide
             if "SILENCE" == self.__log_level:
                 logging.addLevelName(60, "SILENCE")
             log_kwargs["level"] = self.__log_level
-        with multiprocessing.get_context("spawn").Pool(processes=self.pool_count,
-                                                       initializer=self.pool_initializer,
+        pool_count = min(self.pool_count, len(content_providers))
+        logger.info(f"Scan in {pool_count} processes for {len(content_providers)} providers")
+        with multiprocessing.get_context("spawn").Pool(processes=pool_count,
+                                                       initializer=CredSweeper.pool_initializer,
                                                        initargs=(log_kwargs, )) as pool:
             try:
-                for scan_results in pool.imap_unordered(self.files_scan, (content_providers[x::self.pool_count]
-                                                                          for x in range(self.pool_count))):
+                for scan_results in pool.imap_unordered(self.files_scan,
+                                                        (content_providers[x::pool_count] for x in range(pool_count))):
                     for cred in scan_results:
                         self.credential_manager.add_credential(cred)
             except KeyboardInterrupt:
@@ -301,9 +280,7 @@ def __multi_jobs_scan(self, content_providers: Sequence[Union[DiffContentProvide
 
     # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
 
-    def files_scan(
-            self,  #
-            content_providers: Sequence[Union[DiffContentProvider, TextContentProvider]]) -> List[Candidate]:
+    def files_scan(self, content_providers: Sequence[ContentProvider]) -> List[Candidate]:
         """Auxiliary method for scan one sequence"""
         all_cred: List[Candidate] = []
         for provider in content_providers:
@@ -316,7 +293,7 @@ def files_scan(
 
     # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
 
-    def file_scan(self, content_provider: Union[DiffContentProvider, TextContentProvider]) -> List[Candidate]:
+    def file_scan(self, content_provider: ContentProvider) -> List[Candidate]:
         """Run scanning of file from 'file_provider'.
 
         Args:
diff --git a/credsweeper/filters/value_base64_encoded_pem_check.py b/credsweeper/filters/value_base64_encoded_pem_check.py
@@ -30,7 +30,7 @@ def run(self, line_data: LineData, target: AnalysisTarget) -> bool:
         with contextlib.suppress(Exception):
             text = Util.decode_base64(line_data.value, padding_safe=True, urlsafe_detect=True)
             lines = text.decode(ASCII).splitlines()
-            lines_pos = [x for x in range(len(lines))]
+            lines_pos = list(range(len(lines)))
             for line_pos, line in zip(lines_pos, lines):
                 if PEM_BEGIN_PATTERN in line:
                     new_target = AnalysisTarget(line_pos, lines, lines_pos, target.descriptor)
diff --git a/credsweeper/ml_model/ml_validator.py b/credsweeper/ml_model/ml_validator.py
@@ -1,10 +1,11 @@
 import hashlib
+import json
 import logging
 from pathlib import Path
 from typing import List, Tuple, Union, Optional, Dict
 
 import numpy as np
-import onnxruntime as ort
+from onnxruntime import InferenceSession
 
 import credsweeper.ml_model.features as features
 from credsweeper.common.constants import ThresholdPreset, ML_HUNK
@@ -22,6 +23,8 @@ class MlValidator:
     # applied for unknown characters
     FAKE_CHAR = '\x01'
 
+    _dir_path = Path(__file__).parent
+
     def __init__(
             self,  #
             threshold: Union[float, ThresholdPreset],  #
@@ -36,35 +39,36 @@ def __init__(
             ml_model: path to ml model
             ml_providers: coma separated list of providers https://onnxruntime.ai/docs/execution-providers/
         """
-        dir_path = Path(__file__).parent
+        self.__session: Optional[InferenceSession] = None
 
         if ml_config:
             ml_config_path = Path(ml_config)
         else:
-            ml_config_path = dir_path / "ml_config.json"
+            ml_config_path = MlValidator._dir_path / "ml_config.json"
         with open(ml_config_path, "rb") as f:
-            md5_config = hashlib.md5(f.read()).hexdigest()
+            __ml_config_data = f.read()
+
+        model_config = json.loads(__ml_config_data)
 
         if ml_model:
             ml_model_path = Path(ml_model)
         else:
-            ml_model_path = dir_path / "ml_model.onnx"
+            ml_model_path = MlValidator._dir_path / "ml_model.onnx"
         with open(ml_model_path, "rb") as f:
-            md5_model = hashlib.md5(f.read()).hexdigest()
+            self.__ml_model_data = f.read()
 
         if ml_providers:
-            providers = ml_providers.split(',')
+            self.providers = ml_providers.split(',')
         else:
-            providers = ["CPUExecutionProvider"]
-        self.model_session = ort.InferenceSession(ml_model_path, providers=providers)
+            self.providers = ["CPUExecutionProvider"]
 
-        model_config = Util.json_load(ml_config_path)
         if isinstance(threshold, float):
             self.threshold = threshold
         elif isinstance(threshold, ThresholdPreset) and "thresholds" in model_config:
             self.threshold = model_config["thresholds"][threshold.value]
         else:
             self.threshold = 0.5
+            logger.warning(f"Use fallback threshold value: {self.threshold}")
 
         char_set = set(model_config["char_set"])
         if len(char_set) != len(model_config["char_set"]):
@@ -80,26 +84,44 @@ def __init__(
 
         self.common_feature_list = []
         self.unique_feature_list = []
-        logger.info("Init ML validator with %s provider; config:'%s' md5:%s model:'%s' md5:%s", providers,
-                    ml_config_path, md5_config, ml_model_path, md5_model)
-        logger.debug("ML validator details: %s", model_config)
+        if logger.isEnabledFor(logging.INFO):
+            config_dbg = str(model_config) if logger.isEnabledFor(logging.DEBUG) else ''
+            config_md5 = hashlib.md5(__ml_config_data).hexdigest()
+            model_md5 = hashlib.md5(self.__ml_model_data).hexdigest()
+            logger.info("Init ML validator with providers: '%s' ; model:'%s' md5:%s ; config:'%s' md5:%s ; %s",
+                        self.providers, ml_config_path, config_md5, ml_model_path, model_md5, config_dbg)
         for feature_definition in model_config["features"]:
             feature_class = feature_definition["type"]
             kwargs = feature_definition.get("kwargs", {})
             feature_constructor = getattr(features, feature_class, None)
             if feature_constructor is None:
-                raise ValueError(f'Error while parsing model details. Cannot create feature "{feature_class}"')
+                raise ValueError(f"Error while parsing model details. Cannot create feature '{feature_class}'"
+                                 f" from {feature_definition}")
             try:
                 feature = feature_constructor(**kwargs)
             except TypeError:
-                logger.error(f'Error while parsing model details. Cannot create feature "{feature_class}"'
-                             f' with kwargs "{kwargs}"')
+                logger.error(f"Error while parsing model details. Cannot create feature '{feature_class}'"
+                             f" from {feature_definition}")
                 raise
             if feature_definition["type"] in ["RuleName"]:
                 self.unique_feature_list.append(feature)
             else:
                 self.common_feature_list.append(feature)
 
+    def __reduce__(self):
+        # TypeError: cannot pickle 'onnxruntime.capi.onnxruntime_pybind11_state.InferenceSession' object
+        self.__session = None  # drop the live session first; the `session` property rebuilds it on next access
+        return super().__reduce__()  # then delegate to the inherited __reduce__
+
+    @property
+    def session(self) -> InferenceSession:
+        """session getter to prevent pickle error"""
+        if not self.__session:  # nothing cached yet: unset in __init__ or cleared by __reduce__
+            self.__session = InferenceSession(self.__ml_model_data, providers=self.providers)
+        if not self.__session:  # defensive re-check after the construction attempt
+            raise RuntimeError("InferenceSession was not initialized!")
+        return self.__session
+
     def encode(self, text: str, limit: int) -> np.ndarray:
         """Encodes prepared text to array"""
         result_array: np.ndarray = np.zeros(shape=(limit, self.num_classes), dtype=np.float32)
@@ -136,7 +158,7 @@ def _call_model(self, line_input: np.ndarray, variable_input: np.ndarray, value_
             "value_input": value_input.astype(np.float32),
             "feature_input": feature_input.astype(np.float32),
         }
-        result = self.model_session.run(output_names=None, input_feed=input_feed)
+        result = self.session.run(output_names=None, input_feed=input_feed)
         if result and isinstance(result[0], np.ndarray):
             return result[0]
         raise RuntimeError(f"Unexpected type {type(result[0])}")
@@ -178,8 +200,8 @@ def get_group_features(self, candidates: List[Candidate]) -> Tuple[np.ndarray, n
         default_candidate = candidates[0]
         line_input = self.encode_line(default_candidate.line_data_list[0].line,
                                       default_candidate.line_data_list[0].value_start)[np.newaxis]
-        variable = ""
-        value = ""
+        variable = ''
+        value = ''
         for candidate in candidates:
             if not variable and candidate.line_data_list[0].variable:
                 variable = candidate.line_data_list[0].variable
@@ -251,8 +273,8 @@ def validate_groups(self, group_list: List[Tuple[CandidateKey, List[Candidate]]]
                                                             features_list)
         is_cred = probability > self.threshold
         if logger.isEnabledFor(logging.DEBUG):
-            for i, _ in enumerate(is_cred):
-                logger.debug("ML decision: %s with prediction: %s for value: %s", is_cred[i], probability[i],
+            for i, decision in enumerate(is_cred):
+                logger.debug("ML decision: %s with prediction: %s for value: %s", decision, probability[i],
                              group_list[i][0])
         # apply cast to float to avoid json export issue
         return is_cred, probability.astype(float)
diff --git a/credsweeper/utils/pem_key_detector.py b/credsweeper/utils/pem_key_detector.py
@@ -126,7 +126,7 @@ def sanitize_line(cls, line: str, recurse_level: int = 5) -> str:
         line = line.strip(string.whitespace)
         if line.startswith("//"):
             # simplify first condition for speed-up of doxygen style processing
-            if line.startswith("// ") or line.startswith("/// "):
+            if line.startswith(("// ", "/// ")):
                 # Assume that the commented line is to be separated from base64 code, it may be a part of PEM, otherwise
                 line = line[3:]
         if line.startswith("/*"):
diff --git a/credsweeper/utils/util.py b/credsweeper/utils/util.py
@@ -465,7 +465,7 @@ def is_jks(data: Union[bytes, bytearray]) -> bool:
     def is_lzma(data: Union[bytes, bytearray]) -> bool:
         """According https://en.wikipedia.org/wiki/List_of_file_signatures - lzma also xz"""
         if isinstance(data, (bytes, bytearray)) and 6 <= len(data):
-            if data.startswith(b"\xFD\x37\x7A\x58\x5A\x00") or data.startswith(b"\x5D\x00\x00"):
+            if data.startswith((b"\xFD\x37\x7A\x58\x5A\x00", b"\x5D\x00\x00")):
                 return True
         return False
 
diff --git a/docs/source/credsweeper.deep_scanner.rst b/docs/source/credsweeper.deep_scanner.rst
@@ -28,6 +28,14 @@ credsweeper.deep\_scanner.bzip2\_scanner module
    :undoc-members:
    :show-inheritance:
 
+credsweeper.deep\_scanner.deb\_scanner module
+---------------------------------------------
+
+.. automodule:: credsweeper.deep_scanner.deb_scanner
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
 credsweeper.deep\_scanner.deep\_scanner module
 ----------------------------------------------
 
diff --git a/tests/test_app.py b/tests/test_app.py
@@ -732,8 +732,10 @@ def test_external_ml_n(self) -> None:
     # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
 
     def test_external_ml_p(self) -> None:
-        log_pattern = re.compile(
-            r".*Init ML validator with .+ provider; config:'.+' md5:([0-9a-f]{32}) model:'.+' md5:([0-9a-f]{32})")
+        log_pattern = re.compile(r".*Init ML validator with providers: \S+ ;"
+                                 r" model:'.+' md5:([0-9a-f]{32}) ;"
+                                 r" config:'.+' md5:([0-9a-f]{32}) ;"
+                                 r" .*")
         _stdout, _stderr = self._m_credsweeper(["--path", str(APP_PATH), "--log", "INFO"])
         self.assertEqual(0, len(_stderr))
         self.assertNotIn("CRITICAL", _stdout)
diff --git a/tests/test_main.py b/tests/test_main.py