Skip to content

Commit ed02455

Browse files
authored
Rough draft of core plagiarism detection process (#5)
* initial tokenize_all script * wip * first draft of hash comparison * wip * wip * fix nested directories in concatenate * improve comments for initial PR * tweak
1 parent fbc44d5 commit ed02455

File tree

7 files changed

+614
-25
lines changed

7 files changed

+614
-25
lines changed

bin/concatenate_all.py

+92
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Walks the submission directory and creates a parallel directory of
4+
the concatenated files.
5+
"""
6+
7+
import argparse
8+
import os
9+
import json
10+
import sys
11+
12+
# Locate the Submitty config directory relative to this script
# (presumably installed two levels below the install root — TODO confirm layout).
CONFIG_PATH = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', '..', 'config')
with open(os.path.join(CONFIG_PATH, 'submitty.json')) as open_file:
    OPEN_JSON = json.load(open_file)
# Installation-wide paths read from submitty.json at import time.
SUBMITTY_DATA_DIR = OPEN_JSON['submitty_data_dir']
SUBMITTY_INSTALL_DIR = OPEN_JSON['submitty_install_dir']
17+
18+
19+
def parse_args():
    """Parse the required semester, course, and gradeable positional arguments."""
    parser = argparse.ArgumentParser(description="")
    for positional in ("semester", "course", "gradeable"):
        parser.add_argument(positional)
    return parser.parse_args()
25+
26+
27+
def main():
    """Concatenate every submission version of a gradeable into one file.

    For each <submission_dir>/<user>/<version> directory, writes a parallel
    lichen/concatenated/<gradeable>/<user>/<version>/submission.concatenated
    file containing a short identifying header followed by every submitted
    file (separator + relative path + contents).

    Exits with status 1 if the course or gradeable directory does not exist.
    """
    args = parse_args()

    sys.stdout.write("CONCATENATE ALL...")
    sys.stdout.flush()

    # ===========================================================================
    # error checking
    course_dir = os.path.join(SUBMITTY_DATA_DIR, "courses", args.semester, args.course)
    if not os.path.isdir(course_dir):
        print("ERROR! ", course_dir, " is not a valid course directory")
        sys.exit(1)
    submission_dir = os.path.join(course_dir, "submissions", args.gradeable)
    if not os.path.isdir(submission_dir):
        print("ERROR! ", submission_dir, " is not a valid gradeable submissions directory")
        sys.exit(1)

    # ===========================================================================
    # create the output directory (exist_ok avoids a check-then-create race)
    concatenated_dir = os.path.join(course_dir, "lichen", "concatenated", args.gradeable)
    os.makedirs(concatenated_dir, exist_ok=True)

    # ===========================================================================
    # walk the <user>/<version> subdirectories, skipping stray plain files
    for user in os.listdir(submission_dir):
        if not os.path.isdir(os.path.join(submission_dir, user)):
            continue
        for version in os.listdir(os.path.join(submission_dir, user)):
            if not os.path.isdir(os.path.join(submission_dir, user, version)):
                continue

            # ---------------------------------------------------------------------
            # concatenate all files for this gradeable/user/version into a single file
            my_concatenated_dir = os.path.join(concatenated_dir, user, version)
            os.makedirs(my_concatenated_dir, exist_ok=True)
            my_concatenated_file = os.path.join(my_concatenated_dir, "submission.concatenated")
            with open(my_concatenated_file, 'w') as my_cf:
                # print a brief header of information
                my_cf.write("SEMESTER: " + args.semester + "\n")
                my_cf.write("COURSE: " + args.course + "\n")
                my_cf.write("GRADEABLE: " + args.gradeable + "\n")
                my_cf.write("USER: " + user + "\n")
                my_cf.write("VERSION: " + version + "\n")
                # loop over all files in all subdirectories
                base_path = os.path.join(submission_dir, user, version)
                for my_dir, _dirs, my_files in os.walk(base_path):
                    # sorted() makes the concatenation order deterministic
                    for my_file in sorted(my_files):
                        # skip the submission timestamp marker file
                        if my_file == ".submit.timestamp":
                            continue
                        # TODO: skip files that should be ignored
                        absolute_path = os.path.join(my_dir, my_file)
                        relative_path = absolute_path[len(base_path):]
                        # print a separator & filename
                        my_cf.write("----------------------------------------------------\n")
                        my_cf.write("FILE: " + relative_path + "\n\n")
                        # NOTE(review): assumes every submitted file is decodable
                        # text in the default encoding; a binary submission would
                        # raise UnicodeDecodeError here — confirm upstream filtering.
                        with open(absolute_path) as tmp:
                            # append the contents of the file
                            my_cf.write(tmp.read() + "\n")

    print("done")


if __name__ == "__main__":
    main()

bin/hash_all.py

+107
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Walks the submission directory and creates a parallel directory of
4+
the tokenized files.
5+
6+
"""
7+
8+
import argparse
9+
import os
10+
import json
11+
import subprocess
12+
import sys
13+
import json
14+
import hashlib
15+
16+
17+
# Locate the Submitty config directory relative to this script
# (presumably installed two levels below the install root — TODO confirm layout).
CONFIG_PATH = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', '..', 'config')
with open(os.path.join(CONFIG_PATH, 'submitty.json')) as open_file:
    OPEN_JSON = json.load(open_file)
# Installation-wide paths read from submitty.json at import time.
SUBMITTY_DATA_DIR = OPEN_JSON['submitty_data_dir']
SUBMITTY_INSTALL_DIR = OPEN_JSON['submitty_install_dir']
22+
23+
24+
def parse_args():
    """Parse semester/course/gradeable, the hashing options, and exactly
    one required language flag; reject windows smaller than one token."""
    parser = argparse.ArgumentParser(description="")
    parser.add_argument("semester")
    parser.add_argument("course")
    parser.add_argument("gradeable")
    parser.add_argument("--window", type=int, default=10)
    parser.add_argument("--hash_size", type=int, default=100000)
    language = parser.add_mutually_exclusive_group(required=True)
    for flag in ("--plaintext", "--python", "--cpp"):
        language.add_argument(flag, action='store_true')

    args = parser.parse_args()

    # a sliding window shorter than one token is meaningless
    if args.window < 1:
        print ("ERROR! window must be >= 1")
        exit(1)

    return args
43+
44+
45+
def hasher(args, my_tokenized_file, my_hashes_file):
    """Write one truncated MD5 hash per sliding window of the token stream.

    Reads the JSON token list from my_tokenized_file and, for each window of
    args.window consecutive tokens, writes the first 4 hex digits of the MD5
    of the concatenated token values (one hash per line) to my_hashes_file.

    FIX: the loop previously ran range(0, num - args.window), which drops the
    final window — a sequence of num tokens has num - window + 1 windows.
    """
    with open(my_tokenized_file, 'r') as my_tf:
        with open(my_hashes_file, 'w') as my_hf:
            tokens = json.load(my_tf)
            num = len(tokens)
            # the language dispatch is loop-invariant, so decide it once
            if not args.plaintext:
                if args.python:
                    print("NEED A PYTHON HASHER")
                elif args.cpp:
                    print("NEED A C++ HASHER")
                else:
                    print("UNKNOWN HASHER")
            for i in range(0, num - args.window + 1):
                foo = ""
                if args.plaintext:
                    # concatenate the window's token values into one string
                    foo = "".join(str(tokens[i + j].get("value"))
                                  for j in range(args.window))
                hash_object = hashlib.md5(foo.encode())
                hash_object_string = hash_object.hexdigest()
                # FIXME: this truncation should be adjusted after more full-scale testing
                hash_object_string_truncated = hash_object_string[0:4]
                my_hf.write(hash_object_string_truncated + "\n")
67+
68+
69+
def main():
    """Hash the token stream of every tokenized submission version.

    For each lichen/tokenized/<gradeable>/<user>/<version>/tokens.json,
    writes a parallel lichen/hashes/<gradeable>/<user>/<version>/hashes.txt
    via hasher().  Exits with status 1 if the course or tokenized directory
    does not exist.
    """
    args = parse_args()

    sys.stdout.write("HASH ALL...")
    sys.stdout.flush()

    # ===========================================================================
    # error checking
    course_dir = os.path.join(SUBMITTY_DATA_DIR, "courses", args.semester, args.course)
    if not os.path.isdir(course_dir):
        print("ERROR! ", course_dir, " is not a valid course directory")
        sys.exit(1)
    tokenized_dir = os.path.join(course_dir, "lichen", "tokenized", args.gradeable)
    if not os.path.isdir(tokenized_dir):
        print("ERROR! ", tokenized_dir, " is not a valid gradeable tokenized directory")
        sys.exit(1)

    hashes_dir = os.path.join(course_dir, "lichen", "hashes", args.gradeable)

    # ===========================================================================
    # walk the <user>/<version> subdirectories, skipping stray plain files
    # (consistent with the guards in concatenate_all.py)
    for user in os.listdir(tokenized_dir):
        if not os.path.isdir(os.path.join(tokenized_dir, user)):
            continue
        for version in os.listdir(os.path.join(tokenized_dir, user)):
            if not os.path.isdir(os.path.join(tokenized_dir, user, version)):
                continue
            my_tokenized_file = os.path.join(tokenized_dir, user, version, "tokens.json")

            # ===========================================================================
            # create the output directory (exist_ok avoids a check-then-create race)
            my_hashes_dir = os.path.join(hashes_dir, user, version)
            os.makedirs(my_hashes_dir, exist_ok=True)

            my_hashes_file = os.path.join(my_hashes_dir, "hashes.txt")
            hasher(args, my_tokenized_file, my_hashes_file)


    print("done")


if __name__ == "__main__":
    main()

bin/process_all.sh

+12
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
#!/bin/bash

# Run each stage of the plagiarism-detection pipeline in order:
#   concatenate -> tokenize -> hash -> compare
#
# Usage: process_all.sh <semester> <course> <gradeable>

semester=$1
course=$2
gradeable=$3

# Quote the arguments so values containing spaces or glob characters
# are passed through to each stage intact.
/usr/local/submitty/Lichen/bin/concatenate_all.py "$semester" "$course" "$gradeable"
/usr/local/submitty/Lichen/bin/tokenize_all.py "$semester" "$course" "$gradeable" --plaintext
/usr/local/submitty/Lichen/bin/hash_all.py "$semester" "$course" "$gradeable" --window 5 --plaintext

/usr/local/submitty/Lichen/bin/compare_hashes.out "$semester" "$course" "$gradeable"
12+

bin/tokenize_all.py

+84
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Tokenizes the concatenated files.
4+
"""
5+
6+
import argparse
7+
import os
8+
import json
9+
import subprocess
10+
import sys
11+
12+
13+
# Locate the Submitty config directory relative to this script
# (presumably installed two levels below the install root — TODO confirm layout).
CONFIG_PATH = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', '..', 'config')
with open(os.path.join(CONFIG_PATH, 'submitty.json')) as open_file:
    OPEN_JSON = json.load(open_file)
# Installation-wide paths read from submitty.json at import time.
SUBMITTY_DATA_DIR = OPEN_JSON['submitty_data_dir']
SUBMITTY_INSTALL_DIR = OPEN_JSON['submitty_install_dir']
18+
19+
20+
def parse_args():
    """Parse semester/course/gradeable plus exactly one required language flag."""
    parser = argparse.ArgumentParser(description="")
    for positional in ("semester", "course", "gradeable"):
        parser.add_argument(positional)
    language = parser.add_mutually_exclusive_group(required=True)
    for flag in ("--plaintext", "--python", "--cpp"):
        language.add_argument(flag, action='store_true')
    return parser.parse_args()
30+
31+
32+
def tokenize(args, my_concatenated_file, my_tokenized_file):
    """Run the language-specific tokenizer over one concatenated submission,
    writing its token stream to my_tokenized_file.

    Only the plaintext tokenizer exists so far; the other language flags
    just print a placeholder message.
    """
    if args.python:
        print("NEED A PYTHON TOKENIZER")
        return
    if args.cpp:
        print("NEED A C++ TOKENIZER")
        return
    if not args.plaintext:
        print("UNKNOWN TOKENIZER")
        return

    # external tokenizer binary installed alongside Lichen
    tokenizer = os.path.join(SUBMITTY_INSTALL_DIR, "Lichen", "bin", "plaintext_tokenizer.out")
    with open(my_concatenated_file, 'r') as infile:
        with open(my_tokenized_file, 'w') as outfile:
            subprocess.call([tokenizer, "--ignore_newlines"], stdin=infile, stdout=outfile)
45+
46+
def main():
    """Tokenize every concatenated submission version of a gradeable.

    For each lichen/concatenated/<gradeable>/<user>/<version>/submission.concatenated,
    writes a parallel lichen/tokenized/<gradeable>/<user>/<version>/tokens.json
    via tokenize().  Exits with status 1 if the course or concatenated
    directory does not exist.
    """
    args = parse_args()

    sys.stdout.write("TOKENIZE ALL...")
    sys.stdout.flush()

    # ===========================================================================
    # error checking
    course_dir = os.path.join(SUBMITTY_DATA_DIR, "courses", args.semester, args.course)
    if not os.path.isdir(course_dir):
        print("ERROR! ", course_dir, " is not a valid course directory")
        sys.exit(1)
    concatenated_dir = os.path.join(course_dir, "lichen", "concatenated", args.gradeable)
    if not os.path.isdir(concatenated_dir):
        print("ERROR! ", concatenated_dir, " is not a valid gradeable concatenated directory")
        sys.exit(1)

    tokenized_dir = os.path.join(course_dir, "lichen", "tokenized", args.gradeable)

    # ===========================================================================
    # walk the <user>/<version> subdirectories, skipping stray plain files
    # (consistent with the guards in concatenate_all.py)
    for user in os.listdir(concatenated_dir):
        if not os.path.isdir(os.path.join(concatenated_dir, user)):
            continue
        for version in os.listdir(os.path.join(concatenated_dir, user)):
            if not os.path.isdir(os.path.join(concatenated_dir, user, version)):
                continue
            my_concatenated_file = os.path.join(concatenated_dir, user, version, "submission.concatenated")

            # ===========================================================================
            # create the output directory (exist_ok avoids a check-then-create race)
            my_tokenized_dir = os.path.join(tokenized_dir, user, version)
            os.makedirs(my_tokenized_dir, exist_ok=True)

            my_tokenized_file = os.path.join(my_tokenized_dir, "tokens.json")
            tokenize(args, my_concatenated_file, my_tokenized_file)

    print("done")


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)