Skip to content

Commit 3074497

Browse files
[Refactor:Plagiarism] Redo file structure (#38)
* Initial rewrite of process_all.sh
* Update process_all.sh
* Make modifications to file paths and add timers
* Overhaul concatenate_all.py
* Fix python errors
* Progress: everything through tokenization finished
* Everything works
* Add timers
* Remove unnecessary code
* Little python changes
* William made an oopsie (forgot to deal with provided code)
* Fix minor bugs: fix process_all.sh script, fix spelling issue, and prevent hash_all.py from breaking when empty tokenized files are written
* Fix permissions issue with provided code editing
* Remove typo
* Remove unnecessary print statement

Co-authored-by: williamjallen <[email protected]>
1 parent b1aeac5 commit 3074497

File tree

5 files changed

+266
-287
lines changed

5 files changed

+266
-287
lines changed

bin/concatenate_all.py

+81-102
Original file line numberDiff line numberDiff line change
@@ -8,52 +8,73 @@
88
import os
99
import json
1010
import sys
11-
import shutil
11+
import time
1212
import fnmatch
1313

14-
CONFIG_PATH = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', '..', 'config')
15-
with open(os.path.join(CONFIG_PATH, 'submitty.json')) as open_file:
16-
OPEN_JSON = json.load(open_file)
17-
SUBMITTY_DATA_DIR = OPEN_JSON['submitty_data_dir']
18-
SUBMITTY_INSTALL_DIR = OPEN_JSON['submitty_install_dir']
19-
2014
IGNORED_FILES = [
    ".submit.timestamp"
]


def getConcatFilesInDir(input_dir, regex_patterns):
    """Return the concatenated contents of the selected files under input_dir.

    The directory tree rooted at ``input_dir`` is walked recursively.  Within
    each directory the file names are sorted and filtered against
    ``regex_patterns`` (shell-style wildcards, matched with ``fnmatch``).
    Every selected file contributes a separator line containing its name,
    followed by its contents and a trailing newline.

    Args:
        input_dir: root directory to search.
        regex_patterns: list of wildcard patterns; surrounding whitespace on
            each pattern is ignored.  An empty list, or a list whose first
            entry is the empty string, selects every file.

    Returns:
        A single string holding all selected files, or "" if nothing matched.
    """
    # A blank first pattern (what "".split(',') produces) means "no filter".
    # An empty list is treated the same way instead of raising IndexError.
    select_all = not regex_patterns or regex_patterns[0] == ""
    patterns = [pattern.strip() for pattern in regex_patterns]

    pieces = []
    for my_dir, sub_dirs, my_files in os.walk(input_dir):
        # Sorting in place fixes the order in which os.walk descends, so the
        # output does not depend on the filesystem's directory ordering.
        sub_dirs.sort()

        files = sorted(my_files)
        if not select_all:
            matched = []
            for pattern in patterns:
                matched.extend(fnmatch.filter(files, pattern))
            # A file matching several patterns must only be included once;
            # dict.fromkeys drops repeats while keeping first-seen order.
            files = list(dict.fromkeys(matched))

        for my_file in files:
            # exclude any files we have ignored for all submissions
            if my_file in IGNORED_FILES:
                continue
            absolute_path = os.path.join(my_dir, my_file)
            with open(absolute_path, encoding='ISO-8859-1') as tmp:
                # separator & filename, then the contents of the file
                pieces.append(f"=============== {my_file} ===============\n")
                pieces.append(tmp.read() + "\n")

    # Join once at the end rather than growing a string inside the loop.
    return "".join(pieces)
42+
43+
2544
def parse_args():
    """Parse the command-line arguments for the concatenation step.

    Returns:
        argparse.Namespace with two string attributes:
        ``basepath`` - directory holding ``config.json``; output is written
        beneath it.
        ``datapath`` - root directory under which the
        semester/course/<dir>/gradeable submission trees are looked up.
    """
    parser = argparse.ArgumentParser(
        description="Concatenate the selected files of each submission into a single file."
    )
    parser.add_argument(
        "basepath",
        help="directory containing config.json; concatenated output is written beneath it",
    )
    parser.add_argument(
        "datapath",
        help="root directory containing the semester/course submission data",
    )
    return parser.parse_args()
2949

3050

3151
def main():
52+
start_time = time.time()
3253
args = parse_args()
3354

34-
sys.stdout.write("CONCATENATE ALL...")
55+
sys.stdout.write("CONCATENATE ALL...") # don't want a newline here so can't use print
3556
sys.stdout.flush()
3657

37-
with open(args.config_path) as lichen_config:
38-
lichen_config_data = json.load(lichen_config)
39-
semester = lichen_config_data["semester"]
40-
course = lichen_config_data["course"]
41-
gradeable = lichen_config_data["gradeable"]
42-
users_to_ignore = lichen_config_data["ignore_submissions"]
58+
config_path = os.path.join(args.basepath, "config.json")
59+
if not os.path.isfile(config_path):
60+
print(f"Error: invalid config path provided ({config_path})")
61+
exit(1)
62+
63+
with open(config_path) as config_file:
64+
config = json.load(config_file)
4365

44-
# this assumes regex is seperated by a ','
45-
regex_expressions = lichen_config_data["regex"].split(',')
46-
regex_dirs = lichen_config_data["regex_dirs"]
66+
semester = config["semester"]
67+
course = config["course"]
68+
gradeable = config["gradeable"]
69+
users_to_ignore = config["ignore_submissions"]
70+
regex_patterns = config["regex"].split(',')
71+
regex_dirs = config["regex_dirs"]
4772

4873
# ==========================================================================
49-
# error checking
50-
course_dir = os.path.join(SUBMITTY_DATA_DIR, "courses", semester, course)
51-
if not os.path.isdir(course_dir):
52-
print("ERROR! ", course_dir, " is not a valid course directory")
53-
exit(1)
74+
# Error checking
5475

55-
for e in regex_expressions:
56-
# Check for backwards crawling
76+
# Check for backwards crawling
77+
for e in regex_patterns:
5778
if ".." in e:
5879
print('ERROR! Invalid path component ".." in regex')
5980
exit(1)
@@ -64,99 +85,57 @@ def main():
6485
exit(1)
6586

6687
# ==========================================================================
67-
# create the directory
68-
concatenated_dir = os.path.join(course_dir, "lichen", "concatenated", gradeable)
69-
if not os.path.isdir(concatenated_dir):
70-
os.makedirs(concatenated_dir)
71-
72-
# ==========================================================================
73-
count_total_files = 0
88+
# loop through and concatenate the selected files for each user in this gradeable
7489

7590
for dir in regex_dirs:
76-
submission_dir = os.path.join(course_dir, dir, gradeable)
77-
78-
# more error checking
79-
if not os.path.isdir(submission_dir):
80-
print("ERROR! ", submission_dir, " is not a valid gradeable ", dir, " directory")
81-
exit(1)
82-
83-
# =========================================================================
84-
# walk the subdirectories
85-
for user in sorted(os.listdir(submission_dir)):
86-
if not os.path.isdir(os.path.join(submission_dir, user)):
91+
gradeable_path = os.path.join(args.datapath, semester, course, dir, gradeable)
92+
# loop over each user
93+
for user in sorted(os.listdir(gradeable_path)):
94+
user_path = os.path.join(gradeable_path, user)
95+
if not os.path.isdir(user_path):
8796
continue
8897
elif user in users_to_ignore:
8998
continue
90-
for version in sorted(os.listdir(os.path.join(submission_dir, user))):
91-
if not os.path.isdir(os.path.join(submission_dir, user, version)):
99+
100+
# loop over each version
101+
for version in sorted(os.listdir(user_path)):
102+
version_path = os.path.join(user_path, version)
103+
if not os.path.isdir(version_path):
92104
continue
93105

94-
# -----------------------------------------------------------------
95-
# concatenate all files for this submissison into a single file
96-
my_concatenated_dir = os.path.join(concatenated_dir, user, version)
97-
if not os.path.isdir(my_concatenated_dir):
98-
os.makedirs(my_concatenated_dir)
99-
my_concatenated_file = os.path.join(my_concatenated_dir, "submission.concatenated")
100-
101-
with open(my_concatenated_file, 'a') as my_cf:
102-
# loop over all files in all subdirectories
103-
base_path = os.path.join(submission_dir, user, version)
104-
for my_dir, _dirs, my_files in os.walk(base_path):
105-
# Determine if regex should be used (no regex provided
106-
# is equivalent to selecting all files)
107-
files = sorted(my_files)
108-
if regex_expressions[0] != "":
109-
files_filtered = []
110-
for e in regex_expressions:
111-
files_filtered.extend(fnmatch.filter(files, e.strip()))
112-
files = files_filtered
113-
114-
for my_file in files:
115-
# exclude any files we have ignored for all submissions
116-
if my_file in IGNORED_FILES:
117-
continue
118-
absolute_path = os.path.join(my_dir, my_file)
119-
# print a separator & filename
120-
my_cf.write(f"=============== {my_file} ===============\n")
121-
with open(absolute_path, encoding='ISO-8859-1') as tmp:
122-
# append the contents of the file
123-
my_cf.write(tmp.read())
124-
my_cf.write("\n")
125-
count_total_files += 1
106+
output_file_path = os.path.join(args.basepath, "users", user,
107+
version, "submission.concatenated")
108+
109+
if not os.path.exists(os.path.dirname(output_file_path)):
110+
os.makedirs(os.path.dirname(output_file_path))
111+
112+
# append to concatenated file
113+
with open(output_file_path, "a") as output_file:
114+
concatenated_contents = getConcatFilesInDir(version_path, regex_patterns)
115+
output_file.write(concatenated_contents)
116+
126117
# ==========================================================================
127-
# iterate over all of the created submissions, checking to see if they are
118+
# iterate over all of the created submissions, checking to see if they are empty
128119
# and adding a message to the top if so (to differentiate empty files from errors in the UI)
129-
for user in os.listdir(concatenated_dir):
130-
for version in os.listdir(os.path.join(concatenated_dir, user)):
131-
my_concatenated_file = os.path.join(concatenated_dir,
132-
user, version, "submission.concatenated")
120+
for user in os.listdir(os.path.join(args.basepath, "users")):
121+
user_path = os.path.join(args.basepath, "users", user)
122+
for version in os.listdir(user_path):
123+
version_path = os.path.join(user_path, version)
124+
my_concatenated_file = os.path.join(version_path, "submission.concatenated")
133125
with open(my_concatenated_file, "r+") as my_cf:
134126
if my_cf.read() == "":
135127
my_cf.write("Error: No files matched provided regex in selected directories")
136128

137129
# ==========================================================================
138-
# concatenate any files in the provided_code directory
139-
provided_code_path = os.path.join(course_dir, "lichen", "provided_code", gradeable)
140-
output_dir = os.path.join(course_dir, "lichen", "concatenated",
141-
gradeable, "provided_code", "provided_code")
142-
output_file = os.path.join(output_dir, "submission.concatenated")
143-
144-
if os.path.isdir(provided_code_path) and len(os.listdir(provided_code_path)) != 0:
145-
# If the directory already exists, delete it and make a new one
146-
if os.path.isdir(output_dir):
147-
shutil.rmtree(output_dir)
148-
os.makedirs(output_dir)
149-
150-
with open(output_file, 'w') as of:
151-
# Loop over all of the provided files and concatenate them
152-
for file in sorted(os.listdir(provided_code_path)):
153-
with open(os.path.join(provided_code_path, file), encoding='ISO-8859-1') as tmp:
154-
# append the contents of the file
155-
of.write(tmp.read())
130+
# concatenate provided code
131+
with open(os.path.join(args.basepath, "provided_code",
132+
"submission.concatenated"), "w") as file:
133+
provided_code_files = os.path.join(args.basepath, "provided_code", "files")
134+
file.write(getConcatFilesInDir(provided_code_files, regex_patterns))
156135

157136
# ==========================================================================
158-
print("done")
159-
print(f"{count_total_files} files concatenated")
137+
end_time = time.time()
138+
print("done in " + "%.0f" % (end_time - start_time) + " seconds")
160139

161140

162141
if __name__ == "__main__":

bin/hash_all.py

+46-53
Original file line numberDiff line numberDiff line change
@@ -8,30 +8,22 @@
88
import argparse
99
import os
1010
import json
11+
import time
1112
import sys
1213
import hashlib
1314

14-
CONFIG_PATH = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', '..', 'config')
15-
with open(os.path.join(CONFIG_PATH, 'submitty.json')) as open_file:
16-
OPEN_JSON = json.load(open_file)
17-
SUBMITTY_DATA_DIR = OPEN_JSON['submitty_data_dir']
18-
SUBMITTY_INSTALL_DIR = OPEN_JSON['submitty_install_dir']
19-
2015

2116
def parse_args():
    """Parse the command-line arguments for the hashing step.

    Returns:
        argparse.Namespace with a single string attribute ``basepath``: the
        directory that holds ``config.json`` together with the ``users`` and
        ``provided_code`` directories.
    """
    parser = argparse.ArgumentParser(
        description="Hash the token sequences of every tokenized submission."
    )
    parser.add_argument(
        "basepath",
        help="directory containing config.json, users/ and provided_code/",
    )
    return parser.parse_args()
2620

2721

28-
def hasher(args, my_tokenized_file, my_hashes_file):
29-
with open(args.config_path) as lichen_config:
30-
lichen_config_data = json.load(lichen_config)
31-
language = lichen_config_data["language"]
32-
sequence_length = int(lichen_config_data["sequence_length"])
22+
def hasher(lichen_config_data, my_tokenized_file, my_hashes_file):
23+
language = lichen_config_data["language"]
24+
sequence_length = int(lichen_config_data["sequence_length"])
3325

34-
data_json_path = os.path.join(SUBMITTY_INSTALL_DIR, "Lichen", "bin", "data.json")
26+
data_json_path = "./data.json" # data.json is in the Lichen/bin directory after install
3527
with open(data_json_path) as token_data_file:
3628
token_data = json.load(token_data_file)
3729
if language not in token_data:
@@ -45,58 +37,59 @@ def hasher(args, my_tokenized_file, my_hashes_file):
4537
with open(my_tokenized_file, 'r', encoding='ISO-8859-1') as my_tf:
4638
with open(my_hashes_file, 'w') as my_hf:
4739
tokens = json.load(my_tf)
48-
token_values = [str(x.get(token_data[language]["token_value"]))
49-
for x in tokens]
50-
num = len(tokens)
51-
# FIXME: this truncation should be adjusted after testing
52-
token_hashed_values = [(hashlib.md5(''.join(
53-
token_values[x:x+sequence_length]).encode())
54-
.hexdigest())[0:8] for x in range(0, num-sequence_length+1)]
40+
# write empty hashes file if the tokens file was empty (such as
41+
# when there is no provided code)
42+
if tokens is not None:
43+
token_values = [str(x[token_data[language]["token_value"]]) for x in tokens]
44+
num = len(tokens)
45+
# FIXME: this truncation should be adjusted after testing
46+
token_hashed_values = [(hashlib.md5(''.join(
47+
token_values[x:x+sequence_length]).encode())
48+
.hexdigest())[0:8] for x in range(0, num-sequence_length+1)]
5549

56-
my_hf.write('\n'.join(token_hashed_values))
50+
my_hf.write('\n'.join(token_hashed_values))
5751

5852

5953
def main():
    """Hash the tokens of every user submission and of the provided code.

    Reads ``config.json`` from the base path given on the command line, then
    for each ``users/<user>/<version>`` directory beneath it calls ``hasher``
    to turn ``tokens.json`` into ``hashes.txt``.  The same is then done for
    the ``provided_code`` directory.  Progress and elapsed time are printed
    to stdout.

    Exits with status 1 if ``config.json`` or the ``users`` directory is
    missing.
    """
    start_time = time.time()
    args = parse_args()

    # Report a missing config cleanly (same message as concatenate_all.py)
    # rather than letting open() raise a traceback.
    config_path = os.path.join(args.basepath, "config.json")
    if not os.path.isfile(config_path):
        print(f"Error: invalid config path provided ({config_path})")
        sys.exit(1)

    with open(config_path) as lichen_config:
        lichen_config_data = json.load(lichen_config)

    sys.stdout.write("HASH ALL...")  # no trailing newline, so print is not used
    sys.stdout.flush()

    # =========================================================================
    # walk the subdirectories
    users_dir = os.path.join(args.basepath, "users")
    if not os.path.isdir(users_dir):
        print("Error: Unable to find users directory")
        # sys.exit is used because the bare exit() helper is installed by the
        # site module and is not guaranteed to exist in every environment.
        sys.exit(1)

    for user in sorted(os.listdir(users_dir)):
        user_dir = os.path.join(users_dir, user)
        if not os.path.isdir(user_dir):
            continue

        for version in sorted(os.listdir(user_dir)):
            my_dir = os.path.join(user_dir, version)
            if not os.path.isdir(my_dir):
                continue

            my_tokenized_file = os.path.join(my_dir, "tokens.json")
            my_hashes_file = os.path.join(my_dir, "hashes.txt")
            hasher(lichen_config_data, my_tokenized_file, my_hashes_file)

    # ===========================================================================
    # hash the provided code
    provided_code_tokenized = os.path.join(args.basepath, "provided_code", "tokens.json")
    provided_code_hashed = os.path.join(args.basepath, "provided_code", "hashes.txt")
    hasher(lichen_config_data, provided_code_tokenized, provided_code_hashed)

    # ==========================================================================
    end_time = time.time()
    print("done in " + "%.0f" % (end_time - start_time) + " seconds")
10093

10194

10295
if __name__ == "__main__":

0 commit comments

Comments
 (0)