Skip to content

Commit 511c2e5

Browse files
[Feature:Plagiarism] Add support for regex directories (#34)
* Add support for regex directories * Fix python linter error * Check for allowed directories, also fix issue with int/string conversion from the JSON * Add security check * Move error-checking code out of main algorithm Co-authored-by: williamjallen <[email protected]>
1 parent 20377da commit 511c2e5

File tree

2 files changed

+78
-56
lines changed

2 files changed

+78
-56
lines changed

bin/concatenate_all.py

+76-52
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,10 @@
1717
SUBMITTY_DATA_DIR = OPEN_JSON['submitty_data_dir']
1818
SUBMITTY_INSTALL_DIR = OPEN_JSON['submitty_install_dir']
1919

20+
IGNORED_FILES = [
21+
".submit.timestamp"
22+
]
23+
2024

2125
def parse_args():
2226
parser = argparse.ArgumentParser(description="")
@@ -35,73 +39,93 @@ def main():
3539
semester = lichen_config_data["semester"]
3640
course = lichen_config_data["course"]
3741
gradeable = lichen_config_data["gradeable"]
38-
expressions = None
39-
if("regex" in lichen_config_data):
40-
# this assumes regex is seperated by a ','
41-
expressions = lichen_config_data["regex"].split(',')
42+
43+
# this assumes regex is separated by a ','
44+
regex_expressions = lichen_config_data["regex"].split(',')
45+
regex_dirs = lichen_config_data["regex_dirs"]
46+
47+
for e in regex_expressions:
48+
# Check for backwards crawling
49+
if ".." in e:
50+
print('ERROR! Invalid path component ".." in regex')
51+
exit(1)
4252

4353
# =========================================================================
4454
# error checking
4555
course_dir = os.path.join(SUBMITTY_DATA_DIR, "courses", semester, course)
4656
if not os.path.isdir(course_dir):
4757
print("ERROR! ", course_dir, " is not a valid course directory")
4858
exit(1)
49-
submission_dir = os.path.join(course_dir, "submissions", gradeable)
50-
if not os.path.isdir(submission_dir):
51-
print("ERROR! ", submission_dir, " is not a valid gradeable submissions directory")
52-
exit(1)
5359

5460
# =========================================================================
5561
# create the directory
5662
concatenated_dir = os.path.join(course_dir, "lichen", "concatenated", gradeable)
5763
if not os.path.isdir(concatenated_dir):
5864
os.makedirs(concatenated_dir)
5965

60-
# =========================================================================
61-
# walk the subdirectories
62-
for user in sorted(os.listdir(submission_dir)):
63-
if not os.path.isdir(os.path.join(submission_dir, user)):
64-
continue
65-
for version in sorted(os.listdir(os.path.join(submission_dir, user))):
66-
if not os.path.isdir(os.path.join(submission_dir, user, version)):
67-
continue
66+
for dir in regex_dirs:
67+
if dir not in ["submissions", "results", "checkout"]:
68+
print("ERROR! ", dir, " is not a valid input directory for Lichen")
69+
exit(1)
70+
71+
submission_dir = os.path.join(course_dir, dir, gradeable)
6872

69-
# -----------------------------------------------------------------
70-
# concatenate all files for this submisison into a single file
71-
my_concatenated_dir = os.path.join(concatenated_dir, user, version)
72-
if not os.path.isdir(my_concatenated_dir):
73-
os.makedirs(my_concatenated_dir)
74-
my_concatenated_file = os.path.join(my_concatenated_dir, "submission.concatenated")
75-
total_concat = 0
76-
with open(my_concatenated_file, 'w') as my_cf:
77-
# loop over all files in all subdirectories
78-
base_path = os.path.join(submission_dir, user, version)
79-
for my_dir, _dirs, my_files in os.walk(base_path):
80-
# Determine if regex should be used
81-
files = sorted(my_files)
82-
if expressions is not None:
83-
files_filtered = []
84-
for e in expressions:
85-
files_filtered.extend(fnmatch.filter(files, e.strip()))
86-
files = files_filtered
87-
total_concat += len(files)
88-
for my_file in files:
89-
# skip the timestep
90-
if my_file == ".submit.timestamp":
91-
continue
92-
absolute_path = os.path.join(my_dir, my_file)
93-
# print a separator & filename
94-
with open(absolute_path, encoding='ISO-8859-1') as tmp:
95-
# append the contents of the file
96-
my_cf.write(tmp.read())
97-
# Remove concat file if there no content...
98-
if total_concat == 0:
99-
os.remove(my_concatenated_file)
100-
# FIXME: is this the correct path?
101-
p2 = os.path.join(course_dir, "lichen", "tokenized", gradeable, user, version)
102-
if os.path.isdir(p2):
103-
shutil.rmtree(p2)
104-
os.rmdir(my_concatenated_dir)
73+
# more error checking
74+
if not os.path.isdir(submission_dir):
75+
print("ERROR! ", submission_dir, " is not a valid gradeable ", dir, " directory")
76+
exit(1)
77+
78+
# =========================================================================
79+
# walk the subdirectories
80+
for user in sorted(os.listdir(submission_dir)):
81+
if not os.path.isdir(os.path.join(submission_dir, user)):
82+
continue
83+
for version in sorted(os.listdir(os.path.join(submission_dir, user))):
84+
if not os.path.isdir(os.path.join(submission_dir, user, version)):
85+
continue
86+
87+
# -----------------------------------------------------------------
88+
# concatenate all files for this submission into a single file
89+
my_concatenated_dir = os.path.join(concatenated_dir, user, version)
90+
if not os.path.isdir(my_concatenated_dir):
91+
os.makedirs(my_concatenated_dir)
92+
my_concatenated_file = os.path.join(my_concatenated_dir, "submission.concatenated")
93+
total_concat = 0
94+
with open(my_concatenated_file, 'a+') as my_cf:
95+
if len(my_cf.read()) > 0:
96+
total_concat = 1
97+
# loop over all files in all subdirectories
98+
base_path = os.path.join(submission_dir, user, version)
99+
for my_dir, _dirs, my_files in os.walk(base_path):
100+
# Determine if regex should be used (no regex provided
101+
# is equivalent to selecting all files)
102+
files = sorted(my_files)
103+
if regex_expressions[0] != "":
104+
files_filtered = []
105+
for e in regex_expressions:
106+
files_filtered.extend(fnmatch.filter(files, e.strip()))
107+
files = files_filtered
108+
109+
total_concat += len(files)
110+
for my_file in files:
111+
# exclude any files we have ignored for all submissions
112+
if my_file in IGNORED_FILES:
113+
continue
114+
absolute_path = os.path.join(my_dir, my_file)
115+
# print a separator & filename
116+
my_cf.write(f"==========={my_file}===========\n")
117+
with open(absolute_path, encoding='ISO-8859-1') as tmp:
118+
# append the contents of the file
119+
my_cf.write(tmp.read())
120+
my_cf.write("\n")
121+
# Remove concat file if there is no content...
122+
if total_concat == 0:
123+
os.remove(my_concatenated_file)
124+
# FIXME: is this the correct path?
125+
p2 = os.path.join(course_dir, "lichen", "tokenized", gradeable, user, version)
126+
if os.path.isdir(p2):
127+
shutil.rmtree(p2)
128+
os.rmdir(my_concatenated_dir)
105129

106130
# =========================================================================
107131
# concatenate any files in the provided_code directory

compare_hashes/compare_hashes.cpp

+2-4
Original file line numberDiff line numberDiff line change
@@ -174,10 +174,8 @@ int main(int argc, char* argv[]) {
174174
std::string semester = config_file_json.value("semester","ERROR");
175175
std::string course = config_file_json.value("course","ERROR");
176176
std::string gradeable = config_file_json.value("gradeable","ERROR");
177-
std::string sequence_length_str = config_file_json.value("sequence_length","1");
178-
int sequence_length = std::stoi(sequence_length_str);
179-
std::string threshold_str = config_file_json.value("threshold","5");
180-
int threshold = std::stoi(threshold_str);
177+
int sequence_length = config_file_json.value("sequence_length",1);
178+
int threshold = config_file_json.value("threshold",5);
181179

182180
assert (sequence_length >= 1);
183181
assert (threshold >= 2);

0 commit comments

Comments
 (0)