Skip to content

Commit 5234072

Browse files
authored
Git repo scan tool (#725)
* gitdrilla * git unit test * use predefined git repo * update requirements * win tests fix * windows, python3.12 test fix * diff optimization
1 parent ec24777 commit 5234072

File tree

6 files changed

+501
-49
lines changed

6 files changed

+501
-49
lines changed

credsweeper/__main__.py

Lines changed: 140 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,11 @@
55
import time
66
from argparse import ArgumentParser, ArgumentTypeError, Namespace, BooleanOptionalAction
77
from pathlib import Path
8-
from typing import Any, Union, Dict
8+
from typing import Any, Union, Dict, Tuple, Sequence
99

10-
from credsweeper import __version__
10+
from git import Repo, Commit
11+
12+
from credsweeper import __version__, ByteContentProvider
1113
from credsweeper.app import APP_PATH, CredSweeper
1214
from credsweeper.common.constants import ThresholdPreset, Severity, RuleType, DiffRowType, ML_HUNK
1315
from credsweeper.file_handler.abstract_provider import AbstractProvider
@@ -118,6 +120,11 @@ def get_arguments() -> Namespace:
118120
const="log.yaml",
119121
dest="export_log_config",
120122
metavar="PATH")
123+
group.add_argument("--git", help="git repo to scan", dest="git", metavar="PATH")
124+
parser.add_argument("--ref",
125+
help="scan git repo from the ref, otherwise - all branches were scanned (slow)",
126+
dest="ref",
127+
type=str)
121128
parser.add_argument("--rules",
122129
help="path of rule config file (default: credsweeper/rules/config.yaml). "
123130
f"severity:{[i.value for i in Severity]} "
@@ -246,8 +253,8 @@ def get_arguments() -> Namespace:
246253
default=False)
247254
parser.add_argument("--log",
248255
"-l",
249-
help=f"provide logging level of {list(Logger.LEVELS.keys())}"
250-
f"(default: 'warning', case insensitive)",
256+
help=(f"provide logging level of {list(Logger.LEVELS.keys())}"
257+
f" (default: 'warning', case insensitive)"),
251258
default="warning",
252259
dest="log",
253260
metavar="LOG_LEVEL",
@@ -268,6 +275,39 @@ def get_arguments() -> Namespace:
268275
return parser.parse_args()
269276

270277

278+
def get_credsweeper(args: Namespace) -> CredSweeper:
    """Build a CredSweeper instance from the parsed command line arguments.

    Args:
        args: parsed arguments; ``denylist_path`` is read here to prepare the deny list,
            the remaining attributes are passed to the CredSweeper constructor.

    Returns:
        the configured CredSweeper object

    """
    # non-empty items of the deny file are used both for lines and for values exclusion
    denylist = ([item for item in Util.read_file(args.denylist_path) if item]
                if args.denylist_path is not None else [])
    options = {
        "rule_path": args.rule_path,
        "config_path": args.config_path,
        "json_filename": args.json_filename,
        "xlsx_filename": args.xlsx_filename,
        "stdout": args.stdout,
        "color": args.color,
        "hashed": args.hashed,
        "subtext": args.subtext,
        "sort_output": args.sort_output,
        # NOTE(review): the value of args.no_filters is passed as-is - presumably the
        # argument parser already stores it inverted; confirm against get_arguments()
        "use_filters": args.no_filters,
        "pool_count": args.jobs,
        "ml_batch_size": args.ml_batch_size,
        "ml_threshold": args.ml_threshold,
        "ml_config": args.ml_config,
        "ml_model": args.ml_model,
        "ml_providers": args.ml_providers,
        "find_by_ext": args.find_by_ext,
        "depth": args.depth,
        "doc": args.doc,
        "severity": args.severity,
        "size_limit": args.size_limit,
        "exclude_lines": denylist,
        "exclude_values": denylist,
        "thrifty": args.thrifty,
        "log_level": args.log,
    }
    return CredSweeper(**options)
309+
310+
271311
def scan(args: Namespace, content_provider: AbstractProvider) -> int:
272312
"""Scan content_provider data, print results or save them to json_filename is not None
273313
@@ -283,42 +323,101 @@ def scan(args: Namespace, content_provider: AbstractProvider) -> int:
283323
284324
"""
285325
try:
286-
if args.denylist_path is not None:
287-
denylist = [line for line in Util.read_file(args.denylist_path) if line]
288-
else:
289-
denylist = []
290-
291-
credsweeper = CredSweeper(rule_path=args.rule_path,
292-
config_path=args.config_path,
293-
json_filename=args.json_filename,
294-
xlsx_filename=args.xlsx_filename,
295-
stdout=args.stdout,
296-
color=args.color,
297-
hashed=args.hashed,
298-
subtext=args.subtext,
299-
sort_output=args.sort_output,
300-
use_filters=args.no_filters,
301-
pool_count=args.jobs,
302-
ml_batch_size=args.ml_batch_size,
303-
ml_threshold=args.ml_threshold,
304-
ml_config=args.ml_config,
305-
ml_model=args.ml_model,
306-
ml_providers=args.ml_providers,
307-
find_by_ext=args.find_by_ext,
308-
depth=args.depth,
309-
doc=args.doc,
310-
severity=args.severity,
311-
size_limit=args.size_limit,
312-
exclude_lines=denylist,
313-
exclude_values=denylist,
314-
thrifty=args.thrifty,
315-
log_level=args.log)
326+
credsweeper = get_credsweeper(args)
316327
return credsweeper.run(content_provider=content_provider)
317328
except Exception as exc:
318329
logger.critical(exc, exc_info=True)
330+
logger.exception(exc)
319331
return -1
320332

321333

334+
def get_commit_providers(commit: Commit, repo: Repo) -> Sequence[ByteContentProvider]:
    """Collect content providers for the files which a commit results in.

    The commit is compared with each of its parents, or with ``repo.tree()`` when it has
    no parents. Only the resulting side of every diff item (``b_blob``) is taken, and each
    path is used once - the first occurrence wins.

    Args:
        commit: the commit to process
        repo: the repository, used to get a tree when the commit has no parents

    Returns:
        list of providers with blob contents in order of the first appearance of a path

    """
    providers = {}
    parents = commit.parents
    bases = parents if parents else [repo.tree()]
    for base in bases:
        for change in base.diff(commit):
            # only result files
            blob = change.b_blob
            if not blob or blob.path in providers:
                continue
            try:
                content = blob.data_stream.read()
                providers[blob.path] = ByteContentProvider(content=content,
                                                           file_path=str(blob.path),
                                                           info=DiffRowType.ADDED.value)
            except Exception as exc:
                # reading is best-effort: the item is skipped and other files are still collected
                logger.warning(f"A submodule was not properly initialized or commit was removed: {exc}")
    return list(providers.values())
350+
351+
352+
def drill(args: Namespace) -> Tuple[int, int]:
    """Scan repository for branches and commits

    Start points are commits of the refs named ``args.ref``; when no ref has the name,
    ``args.ref`` itself is used as a single commit reference. Without ``args.ref`` the
    refs whose names start with 'origin/' or 'refs/heads/' are used. Parents of every
    visited commit are visited too. A commit is skipped when a per-commit report
    (``<name>.<commit>.<ext>`` derived from ``args.json_filename`` or
    ``args.xlsx_filename``) already exists.

    Args:
        args: parsed command line arguments

    Returns:
        total credentials found (-1 when the scan was interrupted by an exception)
        total scanned commits

    """
    total_credentials = 0
    total_commits = 0
    try:
        # repo init first
        repo = Repo(args.git)
        if args.ref:
            commits_sha1 = {x.commit.hexsha for x in repo.refs if x.name == args.ref}
            if not commits_sha1:
                commits_sha1 = {args.ref}  # single commit sha1 reference
        else:
            commits_sha1 = {x.commit.hexsha for x in repo.refs if x.name.startswith(('origin/', 'refs/heads/'))}
        logger.info(f"Git repository {args.git} with commits: {commits_sha1}")
        # then - credsweeper
        credsweeper = get_credsweeper(args)
        # use flat iterations to avoid recursive limits
        to_scan = list(commits_sha1)
        # local speedup for already visited commits - avoid file system interactive
        scanned = set()
        while to_scan:
            commit_sha1 = to_scan.pop()
            if commit_sha1 in scanned:
                # the commit was visited in this launch
                continue
            # Mark the commit before any early exit. Otherwise a commit skipped due to an existing
            # report stays unmarked, and each other path to it (e.g. after merges) repeats the
            # commit lookup, the parents expansion and the report checks for its whole history.
            scanned.add(commit_sha1)
            commit = repo.commit(commit_sha1)
            # add parents anyway - their reports may be absent even if this commit is skipped
            to_scan.extend(x.hexsha for x in commit.parents)
            # check whether the commit has been checked and the report is present
            skip_already_scanned = False
            if args.json_filename:
                json_path = Path(args.json_filename)
                json_path = json_path.with_suffix(f".{commit_sha1}{json_path.suffix}")
                if json_path.exists():
                    skip_already_scanned = True
                else:
                    credsweeper.json_filename = json_path
            if args.xlsx_filename:
                xlsx_path = Path(args.xlsx_filename)
                xlsx_path = xlsx_path.with_suffix(f".{commit_sha1}{xlsx_path.suffix}")
                if xlsx_path.exists():
                    skip_already_scanned = True
                else:
                    credsweeper.xlsx_filename = xlsx_path
            if skip_already_scanned:
                logger.info("Skip already scanned commit: %s", commit_sha1)
                continue
            logger.info("Scan commit: %s", commit_sha1)
            # prepare all files to scan in the commit with bytes->IO transformation to avoid a multiprocess issue
            if providers := get_commit_providers(commit, repo):
                # candidates of the previous commit must not get into the report of the current one
                credsweeper.credential_manager.candidates.clear()
                credsweeper.scan(providers)
                credsweeper.post_processing()
                credsweeper.export_results()
                total_credentials += credsweeper.credential_manager.len_credentials()
            total_commits += 1
    except Exception as exc:
        logger.critical(exc, exc_info=True)
        return -1, total_commits
    return total_credentials, total_commits
419+
420+
322421
def main() -> int:
323422
"""Main function"""
324423
result = EXIT_FAILURE
@@ -328,7 +427,7 @@ def main() -> int:
328427
if args.banner:
329428
print(f"CredSweeper {__version__} crc32:{check_integrity():08x}")
330429
Logger.init_logging(args.log, args.log_config_path)
331-
logger.info(f"Init CredSweeper object with arguments: {args}")
430+
logger.info(f"Init CredSweeper object with arguments: {args} CWD: {os.getcwd()}")
332431
summary: Dict[str, int] = {}
333432
if args.path:
334433
logger.info(f"Run analyzer on path: {args.path}")
@@ -353,6 +452,12 @@ def main() -> int:
353452
result = EXIT_SUCCESS
354453
# collect number of all found credential to produce error code when necessary
355454
credentials_number = add_credentials_number + del_credentials_number
455+
elif args.git:
456+
logger.info(f"Run analyzer on GIT: {args.git}")
457+
credentials_number, commits_number = drill(args)
458+
summary[f"Detected Credentials in {args.git} for {commits_number} commits "] = credentials_number
459+
if 0 <= credentials_number:
460+
result = EXIT_SUCCESS
356461
elif args.export_config:
357462
logging.info(f"Exporting default config to file: {args.export_config}")
358463
config_dict = Util.json_load(APP_PATH / "secret" / "config.json")

credsweeper/app.py

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,8 @@
1616
from credsweeper.credentials import Candidate, CredentialManager, CandidateKey
1717
from credsweeper.deep_scanner.deep_scanner import DeepScanner
1818
from credsweeper.file_handler.content_provider import ContentProvider
19-
from credsweeper.file_handler.diff_content_provider import DiffContentProvider
2019
from credsweeper.file_handler.file_path_extractor import FilePathExtractor
2120
from credsweeper.file_handler.abstract_provider import AbstractProvider
22-
from credsweeper.file_handler.text_content_provider import TextContentProvider
2321
from credsweeper.scanner import Scanner
2422
from credsweeper.ml_model.ml_validator import MlValidator
2523
from credsweeper.utils import Util
@@ -215,7 +213,7 @@ def run(self, content_provider: AbstractProvider) -> int:
215213
content_provider: path objects to scan
216214
217215
"""
218-
_empty_list: Sequence[Union[DiffContentProvider, TextContentProvider]] = []
216+
_empty_list: Sequence[ContentProvider] = []
219217
file_extractors = content_provider.get_scannable_files(self.config) if content_provider else _empty_list
220218
if not file_extractors:
221219
logger.info(f"No scannable targets for {len(content_provider.paths)} paths")
@@ -229,7 +227,7 @@ def run(self, content_provider: AbstractProvider) -> int:
229227

230228
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
231229

232-
def scan(self, content_providers: Sequence[Union[DiffContentProvider, TextContentProvider]]) -> None:
230+
def scan(self, content_providers: Sequence[ContentProvider]) -> None:
233231
"""Run scanning of files from an argument "content_providers".
234232
235233
Args:
@@ -243,15 +241,15 @@ def scan(self, content_providers: Sequence[Union[DiffContentProvider, TextConten
243241

244242
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
245243

246-
def __single_job_scan(self, content_providers: Sequence[Union[DiffContentProvider, TextContentProvider]]) -> None:
244+
def __single_job_scan(self, content_providers: Sequence[ContentProvider]) -> None:
247245
"""Performs scan in main thread"""
248246
logger.info(f"Scan for {len(content_providers)} providers")
249247
all_cred = self.files_scan(content_providers)
250248
self.credential_manager.set_credentials(all_cred)
251249

252250
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
253251

254-
def __multi_jobs_scan(self, content_providers: Sequence[Union[DiffContentProvider, TextContentProvider]]) -> None:
252+
def __multi_jobs_scan(self, content_providers: Sequence[ContentProvider]) -> None:
255253
"""Performs scan with multiple jobs"""
256254
# use this separation to satisfy YAPF formatter
257255
yapfix = "%(asctime)s | %(levelname)s | %(processName)s:%(threadName)s | %(filename)s:%(lineno)s | %(message)s"
@@ -265,7 +263,7 @@ def __multi_jobs_scan(self, content_providers: Sequence[Union[DiffContentProvide
265263
logger.info(f"Scan in {pool_count} processes for {len(content_providers)} providers")
266264
with multiprocessing.get_context("spawn").Pool(processes=pool_count,
267265
initializer=CredSweeper.pool_initializer,
268-
initargs=(log_kwargs, )) as pool:
266+
initargs=(log_kwargs,)) as pool: # yapf: disable
269267
try:
270268
for scan_results in pool.imap_unordered(self.files_scan,
271269
(content_providers[x::pool_count] for x in range(pool_count))):

docs/source/guide.rst

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,8 @@ Get all argument list:
1414
.. code-block:: text
1515
1616
usage: python -m credsweeper [-h]
17-
(--path PATH [PATH ...] | --diff_path PATH [PATH ...] | --export_config [PATH] | --export_log_config [PATH])
18-
[--rules PATH] [--severity SEVERITY]
17+
(--path PATH [PATH ...] | --diff_path PATH [PATH ...] | --export_config [PATH] | --export_log_config [PATH] | --git PATH)
18+
[--ref REF] [--rules PATH] [--severity SEVERITY]
1919
[--config PATH] [--log_config PATH]
2020
[--denylist PATH] [--find-by-ext]
2121
[--depth POSITIVE_INT] [--no-filters] [--doc]
@@ -43,6 +43,9 @@ Get all argument list:
4343
--export_log_config [PATH]
4444
exporting default logger config to file (default:
4545
log.yaml)
46+
--git PATH git repo to scan
47+
--ref REF scan git repo from the ref, otherwise - all branches
48+
were scanned (slow)
4649
--rules PATH path of rule config file (default:
4750
credsweeper/rules/config.yaml). severity:['critical',
4851
'high', 'medium', 'low', 'info'] type:['keyword',
@@ -93,8 +96,8 @@ Get all argument list:
9396
--sort, --no-sort enable output sorting (default: False)
9497
--log LOG_LEVEL, -l LOG_LEVEL
9598
provide logging level of ['DEBUG', 'INFO', 'WARN',
96-
'WARNING', 'ERROR', 'FATAL', 'CRITICAL',
97-
'SILENCE'](default: 'warning', case insensitive)
99+
'WARNING', 'ERROR', 'FATAL', 'CRITICAL', 'SILENCE']
100+
(default: 'warning', case insensitive)
98101
--size_limit SIZE_LIMIT
99102
set size limit of files that for scanning (eg. 1GB /
100103
10MiB / 1000)

requirements.txt

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,13 @@ twine==6.1.0
1313
base58==2.1.1
1414
beautifulsoup4==4.13.4
1515
colorama==0.4.6
16-
cryptography==45.0.2
16+
cryptography==45.0.3
1717
GitPython==3.1.44
1818
humanfriendly==10.0
1919
lxml==5.4.0
2020
numpy==1.24.4; python_version < '3.10'
21-
numpy==2.2.6; python_version >= '3.10'
21+
numpy==2.2.6; python_version == '3.10'
22+
numpy==2.3.0; python_version > '3.10'
2223
odfpy==1.4.1
2324
xlrd==2.0.1
2425

@@ -30,7 +31,7 @@ onnxruntime==1.22.0; python_version >= '3.10'
3031
openpyxl==3.1.5
3132

3233
# pandas - ML requirement and excel data reading
33-
pandas==2.2.3
34+
pandas==2.3.0
3435

3536
pdfminer.six==20250324
3637
pybase62==1.0.0

tests/test_app.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -200,7 +200,9 @@ def test_it_works_n(self) -> None:
200200
" | --diff_path PATH [PATH ...]" \
201201
" | --export_config [PATH]" \
202202
" | --export_log_config [PATH]" \
203+
" | --git PATH" \
203204
")" \
205+
" [--ref REF]" \
204206
" [--rules PATH]" \
205207
" [--severity SEVERITY]" \
206208
" [--config PATH]" \
@@ -235,6 +237,7 @@ def test_it_works_n(self) -> None:
235237
" --diff_path" \
236238
" --export_config" \
237239
" --export_log_config" \
240+
" --git" \
238241
" is required "
239242
expected = " ".join(expected.split())
240243
self.assertEqual(expected, output)

0 commit comments

Comments
 (0)