Skip to content

Commit d387cc3

Browse files
authored
Merge pull request #63 from NetherlandsForensicInstitute/add_check_if_contains
Added check if contains option.
2 parents 20a85ea + 9748884 commit d387cc3

File tree

4 files changed

+74
-2
lines changed

4 files changed

+74
-2
lines changed

bin/demeuk.py

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,8 @@
6666
with as comma-seperated list.
6767
--check-ending-with <string> Drop lines ending with string, can be multiple strings. Specify multiple
6868
with as comma-seperated list.
69+
--check-contains <string> Drop lines containing string, can be multiple strings. Specify multiple
70+
with as comma-seperated list.
6971
--check-empty-line Drop lines that are empty or only contain whitespace characters
7072
--check-regex <string> Drop lines that do not match the regex. Regex is a comma seperated list of
7173
regexes. Example: [a-z]{1,8},[0-9]{1,8}
@@ -169,7 +171,7 @@
169171
from unidecode import unidecode
170172

171173

172-
version = '4.4.0'
174+
version = '4.5.0'
173175

174176
# Search from start to finish for the string $HEX[], with block of a-f0-9 with even number
175177
# of hex chars. The first match group is repeated.
@@ -615,6 +617,23 @@ def check_ending_with(line, strings):
615617
return False
616618

617619

620+
def check_contains(line, strings):
    """Checks if a line contains at least one of the given strings

    Params:
        line (unicode): the line to inspect
        strings (list[str]): substrings to look for

    Returns:
        True if line contains any one of the strings, False otherwise
        (also False when strings is empty)

    """
    # NOTE: an empty string is a substring of every line, so an empty entry
    # in strings makes this return True for any line.
    return any(string in line for string in strings)
635+
636+
618637
def check_empty_line(line):
619638
"""Checks if a line is empty or only contains whitespace chars
620639
@@ -1231,6 +1250,12 @@ def clean_up(lines):
12311250
log.append(f'Check_ending_with; dropped line because {to_check} found; {line_decoded}{linesep}')
12321251
stop = True
12331252

1253+
if config.get('check-contains') and not stop:
1254+
to_check = config.get("check-contains")
1255+
if check_contains(line_decoded, to_check):
1256+
log.append(f'Check-contains; dropped line because {to_check} found; {line_decoded}{linesep}')
1257+
stop = True
1258+
12341259
if config.get('check-empty-line') and not stop:
12351260
if check_empty_line(line_decoded):
12361261
log_line = "Check_empty_line; dropped line because is empty or only contains whitespace;"
@@ -1390,6 +1415,7 @@ def main():
13901415
'check-starting-with': False,
13911416
'check-uuid': False,
13921417
'check-ending-with': False,
1418+
'check-contains': False,
13931419
'check-empty-line': False,
13941420
'check-regex': False,
13951421
'check-min-digits': 0,
@@ -1550,6 +1576,12 @@ def main():
15501576
else:
15511577
config['check-ending-with'] = [arguments.get('--check-ending-with')]
15521578

1579+
if arguments.get('--check-contains'):
1580+
if ',' in arguments.get('--check-contains'):
1581+
config['check-contains'] = arguments.get('--check-contains').split(',')
1582+
else:
1583+
config['check-contains'] = [arguments.get('--check-contains')]
1584+
15531585
if arguments.get('--check-empty-line'):
15541586
config['check-empty-line'] = True
15551587

docs/usage.rst

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -292,14 +292,24 @@ characters are transfered to ':'.
292292
check-ending-with
293293
~~~~~~~~~~~~~~~~~
294294
Checks if a line ends with the argument of check-ending-with. If the line ends
295-
with this, the line will be dropped. The string to check can be multiple strings. multiple
295+
with this, the line will be dropped. The string to check can be multiple strings. Multiple
296296
values are comma-separated. Example: #,// would skip lines ending with '#' and with
297297
'//'.
298298

299299
If you enabled the '--tab' option and you want to drop lines ending with a tab, add
300300
':' to the list of strings to check: '--check-ending-with :'. When using --tab, tab
301301
characters are converted to ':'.
302302

303+
check-contains
304+
~~~~~~~~~~~~~~
305+
Checks if a line contains the argument of check-contains. If the line contains this,
306+
the line will be dropped. The string to check can be multiple strings. Multiple values
307+
are comma-separated. Example: #,// would drop lines containing '#' or '//'.
308+
309+
If you enabled the '--tab' option and you want to drop lines containing a tab, add
310+
':' to the list of strings to check: '--check-contains :'. When using --tab, tab
311+
characters are converted to ':'.
312+
303313
check-empty-line
304314
~~~~~~~~~~~~~~~~
305315
Checks if a line only contains whitespace characters or is empty. If this is true,
@@ -312,6 +322,13 @@ matches all of the regexes, the line will be dropped.
312322
Example: --check-regex '[a-z],[0-9]' will drop lines
313323
that do not contain at least one lowercase character and one number.
314324

325+
Want to remove lines that contain an underscore? Only lines matching the regex are kept:
326+
--check-regex '^[^_]+$'
327+
328+
Want to remove lines that start with a specific string?
329+
--check-regex '^(?!this)' will remove lines starting with 'this'
330+
331+
315332
check-min-digits
316333
~~~~~~~~~~~~~~~~
317334
Checks if a line contains a minimum number of digit characters. If the line does not contain

tests/conftest.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -387,3 +387,9 @@
387387
file.write(f'amsterdam {linesep}')
388388
file.write(f'ROTTERDAM {linesep}')
389389
file.write(f'Cookie Monster {linesep}')
390+
391+
with open('testdata/input53', 'w') as file:
392+
file.write(f'three_down {linesep}')
393+
file.write(f'_amsterdam {linesep}')
394+
file.write(f'ROTTERDAM_ {linesep}')
395+
file.write(f'Cookie Monster {linesep}')

tests/test_app.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -965,3 +965,20 @@ def test_add_title_case():
965965
assert 'Amsterdam' in filecontent
966966
assert 'Rotterdam' in filecontent
967967
assert 'Cookie Monster' in filecontent
968+
969+
970+
def test_check_contains():
971+
testargs = [
972+
'demeuk', '-i', 'testdata/input53', '-o', 'testdata/output53', '-l', 'testdata/log53',
973+
'--verbose', '--check-contains', '_',
974+
]
975+
with patch.object(sys, 'argv', testargs):
976+
main()
977+
978+
with open('testdata/output53') as f:
979+
filecontent = f.read()
980+
981+
assert 'three_down' not in filecontent
982+
assert '_amsterdam' not in filecontent
983+
assert 'ROTTERDAM_' not in filecontent
984+
assert 'Cookie Monster' in filecontent

0 commit comments

Comments
 (0)