Skip to content

Commit 32a29fb

Browse files
authored
Markup check for unknown extensions (#259)
* markup_corrections * add extension verification check
1 parent 29f89e5 commit 32a29fb

File tree

6 files changed

+245
-25
lines changed

6 files changed

+245
-25
lines changed

.ci/benchmark.txt

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
1-
META MD5 c7902b7e94ed36807aa76f49bc5c5b41
2-
DATA MD5 8eb069c8e5d44914fd6dccbcbd6428cc
3-
DATA: 17230760 interested lines. MARKUP: 62332 items
1+
META MD5 95346ca85ce4d3fcefd03fe87803bff8
2+
DATA MD5 c58c9fab721263eac09dbe0929c62cdd
3+
DATA: 17230760 interested lines. MARKUP: 62334 items
44
FileType FileNumber ValidLines Positives Negatives
55
--------------- ------------ ------------ ----------- -----------
66
689 567668 138 487
@@ -64,7 +64,7 @@ FileType FileNumber ValidLines Positives Negatives
6464
.gd 1 37 1
6565
.gml 3 3075 16
6666
.gni 3 5017 19
67-
.go 1280 718792 1514 4861
67+
.go 1280 718792 1514 4862
6868
.golden 5 1168 1 42
6969
.gradle 50 4295 8 189
7070
.graphql 8 454 2 13
@@ -159,7 +159,7 @@ FileType FileNumber ValidLines Positives Negatives
159159
.pug 2 193 2
160160
.purs 1 69 4
161161
.pxd 1 150 2 4
162-
.py 904 297181 750 3934
162+
.py 904 297181 750 3935
163163
.pyi 4 1361 9
164164
.pyp 1 167 1
165165
.python 1 213
@@ -226,10 +226,10 @@ FileType FileNumber ValidLines Positives Negatives
226226
.xib 11 503 164
227227
.xsl 1 311 1
228228
.yaml 168 24422 195 379
229-
.yml 564 57042 1934 1217
229+
.yml 564 57042 1933 1218
230230
.zsh 6 872 12
231231
.zsh-theme 1 97 1
232-
TOTAL: 11640 17230760 17511 50243
232+
TOTAL: 11640 17230760 17510 50246
233233
credsweeper result_cnt : 0, lost_cnt : 0, true_cnt : 0, false_cnt : 0
234234
Rules Positives Negatives Reported TP FP TN FN FPR FNR ACC PRC RCL F1
235235
------------------------------ ----------- ----------- ---------- ---- ---- ----- ----- -------- -------- -------- ----- -------- ----
@@ -274,7 +274,7 @@ Nonce 130 55 0 0
274274
OTP / 2FA Secret 58 3 0 0 3 58 0.000000 1.000000 0.049180 0.000000
275275
Other 9 7321 0 0 7321 9 0.000000 1.000000 0.998772 0.000000
276276
PEM Private Key 1150 76 0 0 76 1150 0.000000 1.000000 0.061990 0.000000
277-
Password 2575 9932 0 0 9932 2575 0.000000 1.000000 0.794115 0.000000
277+
Password 2574 9935 0 0 9935 2574 0.000000 1.000000 0.794228 0.000000
278278
Postman Credentials 2 0 0 0 0 2 1.000000 0.000000 0.000000
279279
SQL Password 44 14 0 0 14 44 0.000000 1.000000 0.241379 0.000000
280280
Salesforce Credentials 6 0 0 0 0 6 1.000000 0.000000 0.000000
@@ -287,4 +287,4 @@ Token 1138 4668 0 0
287287
Twilio Credentials 30 39 0 0 39 30 0.000000 1.000000 0.565217 0.000000
288288
URL Credentials 225 382 0 0 382 225 0.000000 1.000000 0.629325 0.000000
289289
UUID 2508 280 0 0 280 2508 0.000000 1.000000 0.100430 0.000000
290-
17511 50243 0 0 0 50243 17511 0.000000 1.000000 0.741550 0.000000
290+
17510 50246 0 0 0 50246 17510 0.000000 1.000000 0.741573 0.000000

config.json

Lines changed: 202 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,202 @@
1+
{
2+
"exclude": {
3+
"pattern": [],
4+
"containers": [
5+
".aar",
6+
".apk",
7+
".bz2",
8+
".class",
9+
".gz",
10+
".jar",
11+
".lzma",
12+
".rpm",
13+
".tar",
14+
".war",
15+
".whl",
16+
".xz",
17+
".zip"
18+
],
19+
"documents": [
20+
".doc",
21+
".docx",
22+
".odp",
23+
".ods",
24+
".odt",
25+
".pdf",
26+
".ppt",
27+
".pptx",
28+
".xls",
29+
".xlsx"
30+
],
31+
"extension": [
32+
".7z",
33+
".a",
34+
".aac",
35+
".avi",
36+
".bin",
37+
".bmp",
38+
".css",
39+
".dmg",
40+
".ear",
41+
".eot",
42+
".elf",
43+
".exe",
44+
".gif",
45+
".gmo",
46+
".ico",
47+
".img",
48+
".info",
49+
".jpeg",
50+
".jpg",
51+
".lib",
52+
".map",
53+
".m4a",
54+
".mat",
55+
".mo",
56+
".mov",
57+
".mp3",
58+
".mp4",
59+
".mpg",
60+
".mkv",
61+
".npy",
62+
".npz",
63+
".obj",
64+
".oga",
65+
".ogg",
66+
".ogv",
67+
".ops",
68+
".pak",
69+
".png",
70+
".psd",
71+
".pyc",
72+
".pyd",
73+
".pyo",
74+
".rar",
75+
".rc",
76+
".rc2",
77+
".realm",
78+
".res",
79+
".s7z",
80+
".scss",
81+
".so",
82+
".sum",
83+
".svg",
84+
".swf",
85+
".tif",
86+
".tiff",
87+
".tlb",
88+
".ttf",
89+
".vcxproj",
90+
".vdproj",
91+
".wav",
92+
".webm",
93+
".webp",
94+
".wma",
95+
".woff",
96+
".woff2",
97+
".yuv"
98+
],
99+
"path": [
100+
"/.git/",
101+
"/.idea/",
102+
"/.svn/",
103+
"/__pycache__/",
104+
"/node_modules/",
105+
"/target/",
106+
"/.venv/",
107+
"/venv/"
108+
],
109+
"lines": [],
110+
"values": []
111+
},
112+
"source_ext": [
113+
".aspx",
114+
".cs",
115+
".cshtml",
116+
".ejs",
117+
".erb",
118+
".go",
119+
".html",
120+
".ipynb",
121+
".jsp",
122+
".jsx",
123+
".php",
124+
".phtml",
125+
".rb",
126+
".sh",
127+
".swift",
128+
".ts",
129+
".twig",
130+
".vue",
131+
".xhtml",
132+
".java",
133+
".js",
134+
".py",
135+
".cpp",
136+
".c",
137+
".h",
138+
".hpp",
139+
".mm",
140+
".cu",
141+
".y",
142+
".vb",
143+
".m",
144+
".cu"
145+
],
146+
"source_quote_ext": [
147+
".cs",
148+
".cc",
149+
".php",
150+
".tf",
151+
".kt",
152+
".go",
153+
".ipynb",
154+
".ts",
155+
".java",
156+
".js",
157+
".py",
158+
".cpp",
159+
".c",
160+
".h",
161+
".hpp"
162+
],
163+
"find_by_ext_list": [
164+
".pem",
165+
".cer",
166+
".csr",
167+
".der",
168+
".pfx",
169+
".p12",
170+
".key",
171+
".jks"
172+
],
173+
"bruteforce_list": [
174+
"",
175+
"changeit",
176+
"changeme",
177+
"tizen"
178+
],
179+
"check_for_literals": true,
180+
"max_password_value_length": 64,
181+
"max_url_cred_value_length": 80,
182+
"line_data_output": [
183+
"line",
184+
"line_num",
185+
"path",
186+
"info",
187+
"variable",
188+
"variable_start",
189+
"variable_end",
190+
"value",
191+
"value_start",
192+
"value_end",
193+
"entropy"
194+
],
195+
"candidate_output": [
196+
"rule",
197+
"severity",
198+
"confidence",
199+
"ml_probability",
200+
"line_data_list"
201+
]
202+
}

meta/22548287.csv

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -66,8 +66,8 @@ Id,FileID,Domain,RepoName,FilePath,LineStart,LineEnd,GroundTruth,ValueStart,Valu
6666
55582,bb8bcb6e,GitHub,22548287,data/22548287/spec/model/bb8bcb6e.rb,26,26,F,,,,,Token
6767
55583,bb8bcb6e,GitHub,22548287,data/22548287/spec/model/bb8bcb6e.rb,10,10,F,,,,,Password
6868
101442,aca9b509,GitHub,22548287,data/22548287/sample/aca9b509.env,3,3,F,,,,,Secret
69-
101443,b9c876aa,GitHub,22548287,data/22548287/conf/local/b9c876aa.yml,16,16,T,21,29,,,Password
69+
101443,b9c876aa,GitHub,22548287,data/22548287/conf/local/b9c876aa.yml,16,16,F,,,,LABEL,Password
7070
110652,b0bd5063,GitHub,22548287,data/22548287/conf/local/b0bd5063.yml,10,10,F,,,,,Auth
7171
110653,77f993cc,GitHub,22548287,data/22548287/lib/77f993cc.rake,26,26,F,,,,,Password
72-
130501,b9c876aa,GitHub,22548287,data/22548287/conf/local/b9c876aa.yml,25,25,F,,,,,Password
72+
130501,b9c876aa,GitHub,22548287,data/22548287/conf/local/b9c876aa.yml,25,25,F,,,,LABEL,Password
7373
1481106,7a8338ef,GitHub,22548287,data/22548287/conf/local/7a8338ef.yml,30,30,F,17,184,,,Token

meta/a15774b8.csv

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -709,3 +709,4 @@ Id,FileID,Domain,RepoName,FilePath,LineStart,LineEnd,GroundTruth,ValueStart,Valu
709709
1509875,804a8f8d,GitHub,a15774b8,data/a15774b8/sample/cluster/804a8f8d.rsp,454,454,T,35,45,,,Password
710710
1520916,64c4dfcb,GitHub,a15774b8,data/a15774b8/conf/64c4dfcb,209,209,F,38,43,,,CMD Password
711711
11516161,fcf4a37e,GitHub,a15774b8,data/a15774b8/_/fcf4a37e.md,82,82,T,17,25,,,CURL User Password
712+
11524145,8175b361,GitHub,a15774b8,data/a15774b8/_/8175b361.py,127,127,F,,,,,Password

meta/fc8343f4.csv

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -835,3 +835,4 @@ Id,FileID,Domain,RepoName,FilePath,LineStart,LineEnd,GroundTruth,ValueStart,Valu
835835
1525370,cbf683bc,GitHub,fc8343f4,data/fc8343f4/test/src/client/cbf683bc.go,55,55,T,20,44,,,Basic Authorization
836836
1096355,a9119ede,GitHub,fc8343f4,data/fc8343f4/test/src/conf/rest/client/a9119ede.go,655,655,T,1139,1233,,,Key
837837
11503163,cc8ec946,GitHub,fc8343f4,data/fc8343f4/vendor/cc8ec946.go,8647,8647,F,22,30,,,URL Credentials
838+
11524146,5eb7ac2c,GitHub,fc8343f4,data/fc8343f4/test/src/conf/pkg/kube/5eb7ac2c.go,77,77,F,,,,,Password

review_data.py

Lines changed: 30 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,13 @@
77
MAGENTA - templates
88
When value start-end defined - the text is marked
99
"""
10-
10+
import functools
1111
import json
1212
import os
13+
import pathlib
1314
import subprocess
1415
import sys
1516
from argparse import ArgumentParser
16-
from functools import cache
1717
from typing import List, Optional, Tuple, Dict
1818

1919
from colorama import Fore, Back, Style
@@ -25,7 +25,15 @@
2525
EXIT_FAILURE = 1
2626

2727

28-
@cache
28+
@functools.cache
29+
def get_excluding_extensions() -> set[str]:
30+
# copy of CredSweeper/secret/config.json
31+
with open("config.json") as f:
32+
result = json.load(f)
33+
return set(result["exclude"]["containers"] + result["exclude"]["documents"] + result["exclude"]["extension"])
34+
35+
36+
@functools.cache
2937
def read_cache(path) -> list[str]:
3038
with open(path, "r", encoding="utf8") as f:
3139
return f.read().replace("\r\n", '\n').replace('\r', '\n').split('\n')
@@ -115,15 +123,15 @@ def read_data(path, line_start, line_end, value_start, value_end, ground_truth,
115123
f"/.*{path.split('/')[-1]},{line_start},{line_end},.*/d",
116124
f"meta/{repo_id}.csv"])
117125

118-
print("\n\n")
126+
print("\n\n", flush=True)
119127

120128

121-
def main(meta_dir: str,
122-
data_dir: str,
123-
check_only: bool,
124-
data_filter: dict,
125-
load_json: Optional[str] = None,
126-
category: Optional[str] = None) -> int:
129+
def review(meta_dir: str,
130+
data_dir: str,
131+
check_only: bool,
132+
data_filter: dict,
133+
load_json: Optional[str] = None,
134+
category: Optional[str] = None) -> int:
127135
errors = 0
128136
duplicates = 0
129137
if not os.path.exists(meta_dir):
@@ -146,6 +154,11 @@ def main(meta_dir: str,
146154
if category and category not in row.Category.split(':'):
147155
continue
148156

157+
if pathlib.Path(row.FilePath).suffix in get_excluding_extensions():
158+
# the file extension will be excluded during default scan
159+
print(f"File {row.FilePath} is excluded by default config with extension filter!", flush=True)
160+
errors += 1
161+
149162
displayed_rows += 1
150163
if not check_only:
151164
print(str(row), flush=True)
@@ -221,7 +234,7 @@ def main(meta_dir: str,
221234
return result
222235

223236

224-
if __name__ == "__main__":
237+
def main(argv) -> int:
225238
parser = ArgumentParser(prog="python review_data.py",
226239
description="Console script for review markup with colorization")
227240

@@ -233,7 +246,7 @@ def main(meta_dir: str,
233246
parser.add_argument("-X", help="Show X markup", action="store_true")
234247
parser.add_argument("--load", help="Load json report from CredSweeper", nargs='?')
235248
parser.add_argument("--category", help="Filter only with the category", nargs='?')
236-
_args = parser.parse_args()
249+
_args = parser.parse_args(argv[1:])
237250

238251
_data_filter = {"Other": False}
239252
if not _args.T and not _args.F and not _args.X:
@@ -244,8 +257,11 @@ def main(meta_dir: str,
244257
_data_filter["T"] = _args.T
245258
_data_filter["F"] = _args.F
246259
_data_filter["X"] = _args.X
247-
exit_code = main(_args.meta_dir, _args.data_dir, bool(_args.check_only), _data_filter, _args.load, _args.category)
248-
sys.exit(exit_code)
260+
return review(_args.meta_dir, _args.data_dir, bool(_args.check_only), _data_filter, _args.load, _args.category)
261+
262+
263+
if __name__ == """__main__""":
264+
sys.exit(main(sys.argv))
249265

250266
# review generation command
251267
# .venv/bin/python review_data.py meta data >review.$(now).$(git rev-parse HEAD).txt

0 commit comments

Comments
 (0)