Skip to content

Commit c8ddb62

Browse files
authored
LzmaScanner for deep scan (#705)
1 parent f281bd5 commit c8ddb62

File tree

11 files changed

+498
-13
lines changed

11 files changed

+498
-13
lines changed

credsweeper/deep_scanner/deep_scanner.py

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import logging
33
from typing import List, Optional, Any, Tuple, Union
44

5-
from credsweeper.common.constants import RECURSIVE_SCAN_LIMITATION
5+
from credsweeper.common.constants import RECURSIVE_SCAN_LIMITATION, MIN_DATA_LEN
66
from credsweeper.config import Config
77
from credsweeper.credentials import Candidate
88
from credsweeper.credentials.augment_candidates import augment_candidates
@@ -23,6 +23,7 @@
2323
from .html_scanner import HtmlScanner
2424
from .jks_scanner import JksScanner
2525
from .lang_scanner import LangScanner
26+
from .lzma_scanner import LzmaScanner
2627
from .mxfile_scanner import MxfileScanner
2728
from .pdf_scanner import PdfScanner
2829
from .pkcs12_scanner import Pkcs12Scanner
@@ -48,6 +49,7 @@ class DeepScanner(
4849
HtmlScanner, #
4950
JksScanner, #
5051
LangScanner, #
52+
LzmaScanner, #
5153
PdfScanner, #
5254
Pkcs12Scanner, #
5355
PptxScanner, #
@@ -106,6 +108,9 @@ def get_deep_scanners(data: bytes, file_type: str, depth: int) -> Tuple[List[Any
106108
elif Util.is_bzip2(data):
107109
if 0 < depth:
108110
deep_scanners.append(Bzip2Scanner)
111+
elif Util.is_lzma(data):
112+
if 0 < depth:
113+
deep_scanners.append(LzmaScanner)
109114
elif Util.is_tar(data):
110115
if 0 < depth:
111116
deep_scanners.append(TarScanner)
@@ -239,15 +244,18 @@ def recursive_scan(
239244
recursive_limit_size: maximal bytes of opened files to prevent recursive zip-bomb attack
240245
"""
241246
candidates: List[Candidate] = []
242-
logger.debug("Start data_scan: size=%d, depth=%d, limit=%d, path=%s, info=%s", len(data_provider.data), depth,
243-
recursive_limit_size, data_provider.file_path, data_provider.info)
244-
245247
if 0 > depth:
246248
# break recursion if maximal depth is reached
247-
logger.debug("bottom reached %s recursive_limit_size:%d", data_provider.file_path, recursive_limit_size)
249+
logger.debug("Bottom reached %s recursive_limit_size:%d", data_provider.file_path, recursive_limit_size)
248250
return candidates
249-
250251
depth -= 1
252+
if MIN_DATA_LEN > len(data_provider.data):
253+
# break recursion if the data is shorter than MIN_DATA_LEN - too small to be scanned
254+
logger.debug("Too small data: size=%d, depth=%d, limit=%d, path=%s, info=%s", len(data_provider.data),
255+
depth, recursive_limit_size, data_provider.file_path, data_provider.info)
256+
return candidates
257+
logger.debug("Start data_scan: size=%d, depth=%d, limit=%d, path=%s, info=%s", len(data_provider.data), depth,
258+
recursive_limit_size, data_provider.file_path, data_provider.info)
251259

252260
if FilePathExtractor.is_find_by_ext_file(self.config, data_provider.file_type):
253261
# Skip scanning the file and make a fake candidate because the extension is suspicious
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
import logging
2+
import lzma
3+
from abc import ABC
4+
from pathlib import Path
5+
from typing import List, Optional
6+
7+
from credsweeper.credentials import Candidate
8+
from credsweeper.deep_scanner.abstract_scanner import AbstractScanner
9+
from credsweeper.file_handler.data_content_provider import DataContentProvider
10+
from credsweeper.utils import Util
11+
12+
logger = logging.getLogger(__name__)
13+
14+
15+
class LzmaScanner(AbstractScanner, ABC):
    """Deep scanner for lzma/xz compressed data"""

    def data_scan(
            self,  #
            data_provider: DataContentProvider,  #
            depth: int,  #
            recursive_limit_size: int) -> Optional[List[Candidate]]:
        """Decompresses the provider's data with lzma and passes the result to recursive_scan

        Args:
            data_provider: provider whose ``data`` holds the compressed bytes
            depth: recursion depth forwarded to recursive_scan unchanged
            recursive_limit_size: remaining size budget; the size of the
                decompressed data is subtracted before the nested scan

        Return:
            Whatever recursive_scan returns for the decompressed content,
            or None when any exception was raised (the error is logged)

        """
        try:
            source_path = Path(data_provider.file_path)
            inner_path = source_path.as_posix()
            # drop a known compression suffix so the type of the inner file is derived from its own name
            for known_suffix in (".xz", ".lzma"):
                if known_suffix == source_path.suffix:
                    inner_path = inner_path[:-len(known_suffix)]
                    break
            unpacked = lzma.decompress(data_provider.data)
            inner_provider = DataContentProvider(data=unpacked,
                                                 file_path=inner_path,
                                                 file_type=Util.get_extension(inner_path),
                                                 info=f"{data_provider.info}|LZMA:{source_path}")
            # the decompressed size is charged against the limit for the nested scan
            remaining_limit = recursive_limit_size - len(inner_provider.data)
            return self.recursive_scan(inner_provider, depth, remaining_limit)
        except Exception as lzma_exc:
            logger.error(f"{data_provider.file_path}:{lzma_exc}")
        return None

credsweeper/secret/config.json

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,13 @@
22
"exclude": {
33
"pattern": [],
44
"containers": [
5+
".aar",
56
".apk",
67
".bz2",
78
".gz",
9+
".lzma",
810
".tar",
11+
".xz",
912
".zip"
1013
],
1114
"documents": [
@@ -20,17 +23,20 @@
2023
],
2124
"extension": [
2225
".7z",
26+
".a",
2327
".aac",
24-
".aar",
2528
".avi",
29+
".bin",
2630
".bmp",
2731
".class",
2832
".css",
2933
".dmg",
3034
".ear",
3135
".eot",
36+
".elf",
3237
".exe",
3338
".gif",
39+
".gmo",
3440
".ico",
3541
".img",
3642
".info",
@@ -45,17 +51,21 @@
4551
".mp4",
4652
".npy",
4753
".npz",
54+
".obj",
4855
".ogg",
4956
".pak",
5057
".png",
5158
".psd",
5259
".pyc",
5360
".pyd",
5461
".pyo",
62+
".rar",
5563
".rc",
5664
".rc2",
5765
".rar",
5866
".realm",
67+
".res",
68+
".rpm",
5969
".s7z",
6070
".scss",
6171
".so",
@@ -70,6 +80,7 @@
7080
".wav",
7181
".webm",
7282
".webp",
83+
".wma",
7384
".woff",
7485
".yuv"
7586
],

credsweeper/utils/util.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,7 @@ def is_known(data: bytes) -> bool:
161161
or Util.is_gzip(data) \
162162
or Util.is_tar(data) \
163163
or Util.is_bzip2(data) \
164+
or Util.is_lzma(data) \
164165
or Util.is_com(data) \
165166
or Util.is_pdf(data) \
166167
or Util.is_elf(data):
@@ -459,6 +460,14 @@ def is_jks(data: bytes) -> bool:
459460
return True
460461
return False
461462

463+
@staticmethod
def is_lzma(data: bytes) -> bool:
    """Checks the leading signature for lzma or xz data.

    Signatures are taken from https://en.wikipedia.org/wiki/List_of_file_signatures

    Args:
        data: raw bytes to inspect

    Return:
        True if data is a bytes object of at least 6 bytes that starts
        with one of the two known signatures, otherwise False

    """
    # anything that is not bytes, or is shorter than the longest signature, is rejected
    if not isinstance(data, bytes) or len(data) < 6:
        return False
    return data.startswith((b"\xFD\x37\x7A\x58\x5A\x00", b"\x5D\x00\x00"))
470+
462471
@staticmethod
463472
def is_asn1(data: bytes) -> bool:
464473
"""Only sequence type 0x30 and size correctness is checked"""

experiment/src/data_loader.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -43,10 +43,7 @@ def read_detected_data(file_path: str) -> Dict[identifier, Dict]:
4343
# skip not ML values like private keys and so on. Unsupported for ml train. "use_ml" rules ONLY
4444
assert 0 < len(cred["line_data_list"]), cred # at least, one line_data_list must present
4545
line_data = deepcopy(cred["line_data_list"][0])
46-
if hasattr(line_data, "entropy_validation"):
47-
line_data.pop("entropy_validation")
48-
if hasattr(line_data, "entropy"):
49-
line_data.pop("entropy")
46+
line_data.pop("entropy")
5047
line_data.pop("info")
5148
line_data["line"] = None # will be read during join_label with data for ML input only
5249
meta_path = transform_to_meta_path(line_data["path"])

fuzz/re-fuzzing.sh

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,12 @@ find $PARENTDIR/tests/samples/* -type f -print0 | while IFS= read -r -d '' f; do
2121
rm -vf $PARENTDIR/$CORPUS_DIR/$s.bz2
2222
bzip2 -k $PARENTDIR/$CORPUS_DIR/$s
2323
mv -vf $PARENTDIR/$CORPUS_DIR/$s.bz2 $PARENTDIR/$CORPUS_DIR/$(sha1sum $PARENTDIR/$CORPUS_DIR/$s.bz2 | cut -c-40)
24+
rm -vf $PARENTDIR/$CORPUS_DIR/$s.xz
25+
xz -z -k $PARENTDIR/$CORPUS_DIR/$s
26+
mv -vf $PARENTDIR/$CORPUS_DIR/$s.xz $PARENTDIR/$CORPUS_DIR/$(sha1sum $PARENTDIR/$CORPUS_DIR/$s.xz | cut -c-40)
27+
rm -vf $PARENTDIR/$CORPUS_DIR/$s.lzma
28+
lzma -z -k $PARENTDIR/$CORPUS_DIR/$s
29+
mv -vf $PARENTDIR/$CORPUS_DIR/$s.lzma $PARENTDIR/$CORPUS_DIR/$(sha1sum $PARENTDIR/$CORPUS_DIR/$s.lzma | cut -c-40)
2430
# produce zip archive with simple file names
2531
rm -vf $PARENTDIR/$CORPUS_DIR/$s.zip
2632
zip -j -9 -D $PARENTDIR/$CORPUS_DIR/$s.zip $f

tests/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from pathlib import Path
22

33
# total number of files in test samples
4-
SAMPLES_FILES_COUNT = 151
4+
SAMPLES_FILES_COUNT = 153
55

66
# the lowest value of ML threshold is used to display possible lowest values
77
NEGLIGIBLE_ML_THRESHOLD = 0.0001
@@ -20,7 +20,7 @@
2020
SAMPLES_IN_DOC = 778
2121

2222
# archived credentials that are not found without --depth
23-
SAMPLES_IN_DEEP_1 = SAMPLES_POST_CRED_COUNT + 92
23+
SAMPLES_IN_DEEP_1 = SAMPLES_POST_CRED_COUNT + 98
2424
SAMPLES_IN_DEEP_2 = SAMPLES_IN_DEEP_1 + 5
2525
SAMPLES_IN_DEEP_3 = SAMPLES_IN_DEEP_2 + 1
2626

0 commit comments

Comments
 (0)