55import time
66from argparse import ArgumentParser , ArgumentTypeError , Namespace , BooleanOptionalAction
77from pathlib import Path
8- from typing import Any , Union , Dict
8+ from typing import Any , Union , Dict , Tuple , Sequence
99
10- from credsweeper import __version__
10+ from git import Repo , Commit
11+
12+ from credsweeper import __version__ , ByteContentProvider
1113from credsweeper .app import APP_PATH , CredSweeper
1214from credsweeper .common .constants import ThresholdPreset , Severity , RuleType , DiffRowType , ML_HUNK
1315from credsweeper .file_handler .abstract_provider import AbstractProvider
@@ -118,6 +120,11 @@ def get_arguments() -> Namespace:
118120 const = "log.yaml" ,
119121 dest = "export_log_config" ,
120122 metavar = "PATH" )
123+ group .add_argument ("--git" , help = "git repo to scan" , dest = "git" , metavar = "PATH" )
124+ parser .add_argument ("--ref" ,
125+ help = "scan git repo from the ref, otherwise - all branches were scanned (slow)" ,
126+ dest = "ref" ,
127+ type = str )
121128 parser .add_argument ("--rules" ,
122129 help = "path of rule config file (default: credsweeper/rules/config.yaml). "
123130 f"severity:{ [i .value for i in Severity ]} "
@@ -246,8 +253,8 @@ def get_arguments() -> Namespace:
246253 default = False )
247254 parser .add_argument ("--log" ,
248255 "-l" ,
249- help = f"provide logging level of { list (Logger .LEVELS .keys ())} "
250- f" (default: 'warning', case insensitive)" ,
256+ help = ( f"provide logging level of { list (Logger .LEVELS .keys ())} "
257+ f" (default: 'warning', case insensitive)") ,
251258 default = "warning" ,
252259 dest = "log" ,
253260 metavar = "LOG_LEVEL" ,
@@ -268,6 +275,39 @@ def get_arguments() -> Namespace:
268275 return parser .parse_args ()
269276
270277
def get_credsweeper(args: Namespace) -> CredSweeper:
    """Create a CredSweeper instance configured from parsed command-line arguments.

    Args:
        args: parsed arguments; when ``args.denylist_path`` is not None the file is
            read with ``Util.read_file`` and its non-empty lines are used both as
            excluded lines and as excluded values

    Returns:
        the configured CredSweeper object

    """
    denylist = []
    if args.denylist_path is not None:
        # empty entries of the deny list file are dropped
        denylist = [entry for entry in Util.read_file(args.denylist_path) if entry]
    # keyword table is filled in the same order as the constructor call it replaces;
    # the very same list object is passed for both exclusion options
    options = {
        "rule_path": args.rule_path,
        "config_path": args.config_path,
        "json_filename": args.json_filename,
        "xlsx_filename": args.xlsx_filename,
        "stdout": args.stdout,
        "color": args.color,
        "hashed": args.hashed,
        "subtext": args.subtext,
        "sort_output": args.sort_output,
        "use_filters": args.no_filters,
        "pool_count": args.jobs,
        "ml_batch_size": args.ml_batch_size,
        "ml_threshold": args.ml_threshold,
        "ml_config": args.ml_config,
        "ml_model": args.ml_model,
        "ml_providers": args.ml_providers,
        "find_by_ext": args.find_by_ext,
        "depth": args.depth,
        "doc": args.doc,
        "severity": args.severity,
        "size_limit": args.size_limit,
        "exclude_lines": denylist,
        "exclude_values": denylist,
        "thrifty": args.thrifty,
        "log_level": args.log,
    }
    return CredSweeper(**options)
309+
310+
271311def scan (args : Namespace , content_provider : AbstractProvider ) -> int :
272312 """Scan content_provider data, print results or save them to json_filename is not None
273313
@@ -283,42 +323,101 @@ def scan(args: Namespace, content_provider: AbstractProvider) -> int:
283323
284324 """
285325 try :
286- if args .denylist_path is not None :
287- denylist = [line for line in Util .read_file (args .denylist_path ) if line ]
288- else :
289- denylist = []
290-
291- credsweeper = CredSweeper (rule_path = args .rule_path ,
292- config_path = args .config_path ,
293- json_filename = args .json_filename ,
294- xlsx_filename = args .xlsx_filename ,
295- stdout = args .stdout ,
296- color = args .color ,
297- hashed = args .hashed ,
298- subtext = args .subtext ,
299- sort_output = args .sort_output ,
300- use_filters = args .no_filters ,
301- pool_count = args .jobs ,
302- ml_batch_size = args .ml_batch_size ,
303- ml_threshold = args .ml_threshold ,
304- ml_config = args .ml_config ,
305- ml_model = args .ml_model ,
306- ml_providers = args .ml_providers ,
307- find_by_ext = args .find_by_ext ,
308- depth = args .depth ,
309- doc = args .doc ,
310- severity = args .severity ,
311- size_limit = args .size_limit ,
312- exclude_lines = denylist ,
313- exclude_values = denylist ,
314- thrifty = args .thrifty ,
315- log_level = args .log )
326+ credsweeper = get_credsweeper (args )
316327 return credsweeper .run (content_provider = content_provider )
317328 except Exception as exc :
318329 logger .critical (exc , exc_info = True )
330+ logger .exception (exc )
319331 return - 1
320332
321333
def get_commit_providers(commit: Commit, repo: Repo) -> Sequence[ByteContentProvider]:
    """Collect content providers for the files produced by a commit.

    The commit is diffed against each of its parents and the resulting side
    (``b_blob``) of every diff entry is wrapped into a ByteContentProvider marked
    with ``DiffRowType.ADDED``. A path is taken only once - the first diff entry wins.

    Args:
        commit: the commit to process
        repo: repository of the commit; ``repo.tree()`` is used as the only baseline
            when the commit has no parents

    Returns:
        list of providers, one per distinct resulting file path

    """
    providers_by_path = {}
    # NOTE(review): for a parentless commit the baseline is ``repo.tree()`` -
    # confirm that this is the intended baseline for a root commit
    baselines = commit.parents or [repo.tree()]
    for baseline in baselines:
        for change in baseline.diff(commit):
            # only result files
            target_blob = change.b_blob
            if not target_blob:
                continue
            if target_blob.path in providers_by_path:
                continue
            try:
                provider = ByteContentProvider(content=target_blob.data_stream.read(),
                                               file_path=str(target_blob.path),
                                               info=DiffRowType.ADDED.value)
                providers_by_path[target_blob.path] = provider
            except Exception as exc:
                # best effort: an unreadable blob is reported and the rest are still collected
                logger.warning(f"A submodule was not properly initialized or commit was removed: {exc}")
    return list(providers_by_path.values())
350+
351+
def drill(args: Namespace) -> Tuple[int, int]:
    """Scan the history of a git repository commit by commit.

    Start commits come from ``args.ref`` when it is set: references whose name equals
    the value, otherwise the value itself is handed to ``repo.commit``. Without
    ``args.ref`` every reference whose name starts with ``origin/`` or ``refs/heads/``
    is used. From the start commits all parents are visited iteratively.

    When ``args.json_filename`` and/or ``args.xlsx_filename`` is set, each commit gets
    its own report ``<name>.<commit><suffix>``; a commit whose report file already
    exists is skipped and not counted.

    Args:
        args: parsed command-line arguments, ``args.git`` is the repository path

    Returns:
        total credentials found (-1 when an exception interrupted the scan)
        total scanned commits

    """
    total_credentials = 0
    total_commits = 0
    try:
        # repo init first
        repo = Repo(args.git)
        if args.ref:
            commits_sha1 = set(x.commit.hexsha for x in repo.refs if x.name == args.ref)
            if not commits_sha1:
                commits_sha1 = {args.ref}  # single commit sha1 reference
        else:
            commits_sha1 = set(x.commit.hexsha for x in repo.refs
                               if x.name.startswith(('origin/', 'refs/heads/')))
        logger.info(f"Git repository {args.git} with commits: {commits_sha1}")
        # then - credsweeper
        credsweeper = get_credsweeper(args)
        # report templates do not change between commits - build them once
        json_base = Path(args.json_filename) if args.json_filename else None
        xlsx_base = Path(args.xlsx_filename) if args.xlsx_filename else None
        # use flat iterations to avoid recursive limits
        to_scan = list(commits_sha1)
        # commits already visited in this launch - avoids repeated file system checks
        scanned = set()
        while to_scan:
            commit_sha1 = to_scan.pop()
            if commit_sha1 in scanned:
                # the commit was visited in this launch
                continue
            # Mark the commit right away: a commit skipped below because its report exists
            # must not be expanded again each time it is reached from another child.
            scanned.add(commit_sha1)
            commit = repo.commit(commit_sha1)
            if commit.parents:
                # add parents anyway
                to_scan.extend(x.hexsha for x in commit.parents)
            # check whether the commit has been checked and the report is present
            skip_already_scanned = False
            if json_base is not None:
                json_path = json_base.with_suffix(f".{commit_sha1}{json_base.suffix}")
                if json_path.exists():
                    skip_already_scanned = True
                else:
                    credsweeper.json_filename = json_path
            if xlsx_base is not None:
                xlsx_path = xlsx_base.with_suffix(f".{commit_sha1}{xlsx_base.suffix}")
                if xlsx_path.exists():
                    skip_already_scanned = True
                else:
                    credsweeper.xlsx_filename = xlsx_path
            if skip_already_scanned:
                logger.info("Skip already scanned commit: %s", commit_sha1)
                continue
            logger.info("Scan commit: %s", commit_sha1)
            # prepare all files to scan in the commit with bytes->IO transformation to avoid a multiprocess issue
            if providers := get_commit_providers(commit, repo):
                # candidates of the previous commit must not leak into this report
                credsweeper.credential_manager.candidates.clear()
                credsweeper.scan(providers)
                credsweeper.post_processing()
                credsweeper.export_results()
                total_credentials += credsweeper.credential_manager.len_credentials()
            total_commits += 1
    except Exception as exc:
        logger.critical(exc, exc_info=True)
        return -1, total_commits
    return total_credentials, total_commits
419+
420+
322421def main () -> int :
323422 """Main function"""
324423 result = EXIT_FAILURE
@@ -328,7 +427,7 @@ def main() -> int:
328427 if args .banner :
329428 print (f"CredSweeper { __version__ } crc32:{ check_integrity ():08x} " )
330429 Logger .init_logging (args .log , args .log_config_path )
331- logger .info (f"Init CredSweeper object with arguments: { args } " )
430+ logger .info (f"Init CredSweeper object with arguments: { args } CWD: { os . getcwd () } " )
332431 summary : Dict [str , int ] = {}
333432 if args .path :
334433 logger .info (f"Run analyzer on path: { args .path } " )
@@ -353,6 +452,12 @@ def main() -> int:
353452 result = EXIT_SUCCESS
354453 # collect number of all found credential to produce error code when necessary
355454 credentials_number = add_credentials_number + del_credentials_number
455+ elif args .git :
456+ logger .info (f"Run analyzer on GIT: { args .git } " )
457+ credentials_number , commits_number = drill (args )
458+ summary [f"Detected Credentials in { args .git } for { commits_number } commits " ] = credentials_number
459+ if 0 <= credentials_number :
460+ result = EXIT_SUCCESS
356461 elif args .export_config :
357462 logging .info (f"Exporting default config to file: { args .export_config } " )
358463 config_dict = Util .json_load (APP_PATH / "secret" / "config.json" )
0 commit comments